[453] | 1 | #include <assert.h>
|
---|
| 2 | #include <float.h>
|
---|
| 3 | #include <math.h>
|
---|
| 4 | #include <stdio.h>
|
---|
| 5 | #include <stdlib.h>
|
---|
| 6 | #include <string.h>
|
---|
| 7 | #include <stdint.h>
|
---|
| 8 | #include <kernel.h>
|
---|
| 9 | #include <t_syslog.h>
|
---|
| 10 | #include <t_stdlib.h>
|
---|
| 11 | #include <kernel_impl.h>
|
---|
| 12 | #include <target_syssvc.h>
|
---|
| 13 | #include "kendryte-k210.h"
|
---|
| 14 | #include "device.h"
|
---|
| 15 | #include "atomic.h"
|
---|
| 16 | #include "kpu.h"
|
---|
| 17 | #include "utils.h"
|
---|
| 18 | #include "kpu_main.h"
|
---|
| 19 |
|
---|
/* Read-modify-write helper: OR the bits of b into the 32-bit word at address a. */
#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
|
---|
| 21 |
|
---|
/*
 * Enable external interrupts: sets the machine external-interrupt enable
 * bit in mie, then the global machine interrupt enable in mstatus.
 */
void sysctl_enable_irq(void)
{
    set_csr(mie, MIP_MEIP);
    set_csr(mstatus, MSTATUS_MIE);
}
|
---|
| 27 |
|
---|
/*
 * Disable external interrupts: clears the machine external-interrupt
 * enable bit in mie and the global machine interrupt enable in mstatus.
 * Counterpart of sysctl_enable_irq().
 */
void sysctl_disable_irq(void)
{
    clear_csr(mie, MIP_MEIP);
    clear_csr(mstatus, MSTATUS_MIE);
}
|
---|
| 33 |
|
---|
| 34 | uint64_t sysctl_get_time_us(void)
|
---|
| 35 | {
|
---|
| 36 | uint64_t v_cycle = read_cycle();
|
---|
| 37 | return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
|
---|
| 38 | }
|
---|
| 39 |
|
---|
/*
 * Return nonzero when `address` lies in a DMA-reachable region:
 * the cached SRAM window at 0x8000_0000 (6 MiB), the uncached SRAM
 * alias at 0x4000_0000 (8 MiB), or the KPU FIFO output register
 * at 0x5045_0040.
 */
static int is_memory(uintptr_t address)
{
    enum
    {
        mem_len = 6 * 1024 * 1024,
        mem_no_cache_len = 8 * 1024 * 1024,
    };

    if(address >= 0x80000000 && address < 0x80000000 + mem_len)
        return 1;
    if(address >= 0x40000000 && address < 0x40000000 + mem_no_cache_len)
        return 1;
    return address == 0x50450040;
}
|
---|
| 49 |
|
---|
/*
 * Return nonzero when `address` falls inside the 6 MiB cached SRAM
 * window starting at 0x8000_0000.
 */
uint32_t is_memory_cache(uintptr_t address)
{
#define MEM_CACHE_LEN (6 * 1024 * 1024)

    const uintptr_t cache_base = 0x80000000;
    return (address >= cache_base) && (address < cache_base + MEM_CACHE_LEN);
}
|
---|
| 56 |
|
---|
| 57 | int plic_irq_enable(INTNO irq_number)
|
---|
| 58 | {
|
---|
| 59 | if (irq_number != INTNO_AI)
|
---|
| 60 | return -1;
|
---|
| 61 | ena_int(irq_number);
|
---|
| 62 | return 0;
|
---|
| 63 | }
|
---|
| 64 |
|
---|
| 65 | int plic_set_priority(INTNO irq_number, uint32_t priority)
|
---|
| 66 | {
|
---|
| 67 | if (irq_number != INTNO_AI)
|
---|
| 68 | return -1;
|
---|
| 69 | set_ipriority(irq_number, priority);
|
---|
| 70 | return 0;
|
---|
| 71 | }
|
---|
| 72 |
|
---|
/* Callback and opaque context dispatched from ai_done_isr(); published
 * via plic_irq_register() under a CPU lock. */
plic_irq_callback_t ai_done_callback;
void *ai_done_ctx;
|
---|
| 75 |
|
---|
/*
 * Register the callback dispatched by ai_done_isr(). Only INTNO_AI is
 * supported; other interrupt numbers are silently ignored.
 * The callback/context pair is updated under a CPU lock so the ISR never
 * observes a half-updated pair.
 */
void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
{
    ER ret;
    if (irq != INTNO_AI)
        return;

    ret = loc_cpu();

    ai_done_callback = callback;
    ai_done_ctx = ctx;

    /* Only unlock when the lock was actually acquired. */
    if (ret == E_OK)
        unl_cpu();
}
|
---|
| 90 |
|
---|
/*
 * Interrupt service routine for the AI accelerator (INTNO_AI).
 * External interrupts are disabled while the registered callback runs,
 * then re-enabled. `exinf` is the kernel-supplied extended info (unused).
 */
void ai_done_isr(intptr_t exinf)
{
    sysctl_disable_irq();
    if (ai_done_callback != NULL){
        ai_done_callback(ai_done_ctx);
    }
    sysctl_enable_irq();
}
|
---|
| 99 |
|
---|
/* Callback and opaque context dispatched from ai_dma_done_isr();
 * published via kpu_dmac_irq_register() under a CPU lock. */
plic_irq_callback_t ai_dma_done_callback;
void *ai_dma_done_ctx;
|
---|
| 102 |
|
---|
/*
 * Register the callback dispatched by ai_dma_done_isr() for the AI DMA
 * channel. Only AI_DMA_CH is supported; other channels are silently
 * ignored. Sets the DMA interrupt priority, then publishes the
 * callback/context pair under a CPU lock.
 */
void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
        plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
{
    ER ret;
    if (channel_num != AI_DMA_CH)
        return;

    set_ipriority(INTNO_DMAAI, priority);

    ret = loc_cpu();

    ai_dma_done_callback = dmac_callback;
    ai_dma_done_ctx = ctx;

    /* Only unlock when the lock was actually acquired. */
    if (ret == E_OK)
        unl_cpu();
}
|
---|
| 120 |
|
---|
/*
 * Completion handler for the AI DMA channel. External interrupts are
 * disabled while the registered callback runs, then re-enabled.
 * `dma` is the completed DMA handle (unused; context comes from
 * ai_dma_done_ctx).
 */
void ai_dma_done_isr(DMA_Handle_t *dma)
{
    sysctl_disable_irq();
    if (ai_dma_done_callback != NULL) {
        ai_dma_done_callback(ai_dma_done_ctx);
    }
    sysctl_enable_irq();
}
|
---|
| 129 |
|
---|
| 130 | void dmac_set_irq(dmac_channel_number_t channel_num,
|
---|
| 131 | plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
|
---|
| 132 | {
|
---|
| 133 | ER ret;
|
---|
| 134 | if (channel_num != AI_DMA_CH)
|
---|
| 135 | return;
|
---|
| 136 |
|
---|
| 137 | set_ipriority(INTNO_DMAAI, priority);
|
---|
| 138 |
|
---|
| 139 | ret = loc_cpu();
|
---|
| 140 |
|
---|
| 141 | ai_dma_done_callback = dmac_callback;
|
---|
| 142 | ai_dma_done_ctx = ctx;
|
---|
| 143 |
|
---|
| 144 | if (ret == E_OK)
|
---|
| 145 | unl_cpu();
|
---|
| 146 | }
|
---|
| 147 |
|
---|
/* DMA handle shared by all AI-channel transfers in this driver. */
DMA_Handle_t g_ai_hdma;
|
---|
| 149 |
|
---|
/*
 * Configure and start a single-block DMA transfer on the AI channel.
 * Only AI_DMA_CH is accepted; other channels are silently ignored.
 * The transfer direction and handshake modes are derived from whether
 * src/dest point into memory (see is_memory()). block_size is in units
 * of the configured transfer width, not bytes.
 */
void dmac_set_single_mode(dmac_channel_number_t channel_num,
        const void *src, void *dest, uint8_t src_inc,
        uint8_t dest_inc,
        uint8_t dmac_burst_size,
        uint8_t dmac_trans_width,
        size_t block_size)
{
    if (channel_num != AI_DMA_CH)
        return;

    DMA_Handle_t *hdma = &g_ai_hdma;
    int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
    /* Pick the flow-control mode from the memory/peripheral nature of
     * the two endpoints. */
    uint8_t flow_control;
    if(mem_type_src == 0 && mem_type_dest == 0)
        flow_control = DMA_PERIPH_TO_PERIPH;
    else if(mem_type_src == 1 && mem_type_dest == 0)
        flow_control = DMA_MEMORY_TO_PERIPH;
    else if(mem_type_src == 0 && mem_type_dest == 1)
        flow_control = DMA_PERIPH_TO_MEMORY;
    else
        flow_control = DMA_MEMORY_TO_MEMORY;

    hdma->Init.Direction = flow_control;                                             /* DMA transfer direction */
    hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);  /* source handshake */
    hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* destination handshake */
    hdma->Init.SrcInc = src_inc;                                                     /* source increment setting */
    hdma->Init.DstInc = dest_inc;                                                    /* destination increment setting */
    hdma->Init.SrcTransWidth = dmac_trans_width;                                     /* source transfer width */
    hdma->Init.DstTransWidth = dmac_trans_width;                                     /* destination transfer width */
    hdma->Init.SrcBurstSize = dmac_burst_size;                                       /* source burst size */
    hdma->Init.DstBurstSize = dmac_burst_size;                                       /* destination burst size */
    dma_reset(hdma);

    dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
}
|
---|
| 185 |
|
---|
/* Number of layer argument sets pushed into the KPU FIFO per refill. */
#define LAYER_BURST_SIZE 12

#define KPU_DEBUG 0
#define USE_CACHED_AI_RAM 0

/* NOTE: min/max evaluate their arguments more than once; do not pass
 * expressions with side effects. */
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
/* Round x up to the next multiple of align (align must be a power of two).
 * Fix: x and align are now fully parenthesized so expansions such as
 * ALIGN_UP(a | b, 8) group correctly. */
#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~((align) - 1)))
|
---|
| 194 |
|
---|
/* Forward declarations for the kmodel execution path (definitions are
 * elsewhere in this file — not visible in this chunk). */
static int ai_step(void *userdata);
static int kpu_kmodel_done(kpu_model_context_t *ctx);
|
---|
| 197 |
|
---|
/* Memory-mapped KPU register block. */
volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
/* Busy flag for the kpu_start()/kpu_done() path (0 = idle, 1 = running). */
static volatile uint32_t kpu_status;

/* Per-run state for the kpu_run() path. */
typedef struct kpu_context
{
    kpu_task_t kpu_task;   /* private copy of the caller's task */
    uint32_t kpu_status;   /* busy flag (0 = idle, 1 = running) */
} kpu_context_t;

volatile kpu_context_t g_kpu_context;
|
---|
| 208 |
|
---|
/* Final completion callback for the kpu_run() path: clears the busy flag
 * and hands the finished task to the user-supplied callback. */
static int kpu_run_all_done(void *_task)
{
    atomic_swap(&g_kpu_context.kpu_status, 0);
    kpu_task_t *task = (kpu_task_t *)_task;
    task->callback(task);
    return 0;
}
|
---|
| 216 |
|
---|
| 217 | int kpu_continue(void *_task)
|
---|
| 218 | {
|
---|
| 219 | kpu_task_t *task = (kpu_task_t *)_task;
|
---|
| 220 | int layer_burst_size = 1;
|
---|
| 221 |
|
---|
| 222 | kpu->interrupt_clear.data = (kpu_config_interrupt_t){
|
---|
| 223 | .calc_done_int = 1,
|
---|
| 224 | .layer_cfg_almost_empty_int = 1,
|
---|
| 225 | .layer_cfg_almost_full_int = 1};
|
---|
| 226 |
|
---|
| 227 | if(task->remain_layers_length == 0)
|
---|
| 228 | {
|
---|
| 229 | return 0;
|
---|
| 230 | }
|
---|
| 231 | if(task->remain_layers_length <= layer_burst_size)
|
---|
| 232 | {
|
---|
| 233 | for(uint32_t i = 0; i < task->remain_layers_length; i++)
|
---|
| 234 | {
|
---|
| 235 | kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
---|
| 236 | kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
---|
| 237 | kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
|
---|
| 238 | kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
|
---|
| 239 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
|
---|
| 240 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
|
---|
| 241 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
|
---|
| 242 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
|
---|
| 243 | kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
|
---|
| 244 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
|
---|
| 245 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
|
---|
| 246 | kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
|
---|
| 247 | }
|
---|
| 248 | task->remain_layers_length = 0;
|
---|
| 249 | } else
|
---|
| 250 | {
|
---|
| 251 | for(uint32_t i = 0; i < layer_burst_size; i++)
|
---|
| 252 | {
|
---|
| 253 | kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
---|
| 254 | kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
---|
| 255 | kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
|
---|
| 256 | kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
|
---|
| 257 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
|
---|
| 258 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
|
---|
| 259 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
|
---|
| 260 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
|
---|
| 261 | kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
|
---|
| 262 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
|
---|
| 263 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
|
---|
| 264 | kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
|
---|
| 265 | }
|
---|
| 266 | task->remain_layers += layer_burst_size;
|
---|
| 267 | task->remain_layers_length -= layer_burst_size;
|
---|
| 268 | }
|
---|
| 269 | return 0;
|
---|
| 270 | }
|
---|
| 271 |
|
---|
| 272 | static int kpu_run_dma_output(uint32_t dma_ch, void *dst, uint32_t length, plic_irq_callback_t cb, void *_task)
|
---|
| 273 | {
|
---|
| 274 | select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
|
---|
| 275 | kpu_dmac_irq_register(dma_ch, kpu_run_all_done, _task, 1);
|
---|
| 276 | dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
|
---|
| 277 | DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);
|
---|
| 278 | return 0;
|
---|
| 279 | }
|
---|
| 280 |
|
---|
/*
 * Input-DMA completion callback for kpu_run(): finishes the input
 * transfer, programs the KPU FIFO, arms the output DMA, then starts
 * pushing layer arguments via kpu_continue().
 */
static int kpu_run_dma_input_done_push_layers(void *_task)
{
    kpu_task_t *task = (kpu_task_t *)_task;
    /* Clear all three KPU interrupt bits. */
    kpu->interrupt_clear.reg = 7;
    dma_end(&g_ai_hdma);
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = task->eight_bit_mode};

    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];

    /* dma_total_byte appears to store (bytes - 1) — hence the +1;
     * same convention as kpu_run(). */
    kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);

    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 0,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_continue(task);
    return 0;
}
|
---|
| 302 |
|
---|
/*
 * Start the input DMA for kpu_run(): copies the source image into the
 * KPU I/O RAM and registers `cb` to fire on completion with `_task`.
 * The input byte count is derived from the first layer's geometry;
 * channel_switch_addr is multiplied by 64, so it appears to be in
 * 64-byte units — TODO confirm against the KPU register spec.
 */
static void kpu_run_dma_input(uint32_t dma_ch, const void *src, plic_irq_callback_t cb, void *_task)
{
    kpu_task_t *task = _task;
    kpu_layer_argument_t *first_layer = &task->layers[0];
    uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);
    kpu_dmac_irq_register(dma_ch, cb, _task, 1);
    /* 64-bit transfers, hence input_size / 8 beats. */
    dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
}
|
---|
| 312 |
|
---|
/*
 * Run a complete task on the KPU (kpu_run() state machine): copies the
 * task into g_kpu_context, marks the final layer to stream its output,
 * sets up the AI interrupt to refill the layer FIFO, and kicks off the
 * input DMA. The rest happens in interrupt context; `callback` fires
 * when the output DMA completes.
 * Returns -1 if a run is already in progress, 0 otherwise.
 */
int kpu_run(kpu_task_t *v_task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback)
{
    /* Claim the KPU; fail if another kpu_run() is still active. */
    if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
        return -1;

    /* Work on a private copy so the caller's task stays untouched. */
    memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
    kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;

    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];

    /* dma_total_byte appears to store (bytes - 1) — hence the +1. */
    uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;

    /* Make the final layer raise an interrupt and stream its output. */
    last_layer->dma_parameter.data.send_data_out = 1;
    last_layer->interrupt_enabe.data.int_en = 1;

    task->dma_ch = dma_ch;
    task->dst = dest;
    task->dst_length = output_size;
    task->callback = callback;
    task->remain_layers_length = task->layers_length;
    task->remain_layers = task->layers;

    /* Each AI interrupt refills the layer FIFO via kpu_continue(). */
    plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, kpu_continue, task);
    plic_irq_enable(INTNO_AI);

    kpu_run_dma_input(dma_ch, src, kpu_run_dma_input_done_push_layers, task);

    return 0;
}
|
---|
| 343 |
|
---|
/*
 * Allocate an output buffer large enough for the task's final layer,
 * rounded up to a multiple of 8 bytes (the 64-bit DMA transfer width).
 * The caller owns the buffer and releases it with kpu_release_output_buf().
 * Returns NULL when allocation fails.
 */
uint8_t *kpu_get_output_buf(kpu_task_t *task)
{
    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
    size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8 * 8;
    return malloc(output_size);
}
|
---|
| 350 |
|
---|
/*
 * Release a buffer obtained from kpu_get_output_buf().
 * free(NULL) is a no-op per the C standard, so the previous explicit
 * NULL guard was redundant and has been removed; passing NULL is safe.
 */
void kpu_release_output_buf(uint8_t *output_buf)
{
    free(output_buf);
}
|
---|
| 356 |
|
---|
/* Output-DMA completion handler for the kpu_start() path: clears the
 * busy flag and invokes the user callback with the user context. */
static int kpu_done(void *ctx)
{
    atomic_swap(&kpu_status, 0);
    kpu_task_t *task = (kpu_task_t *)ctx;
    task->callback(task->ctx);
    return 0;
}
|
---|
| 364 |
|
---|
| 365 | static int kpu_config_input(void *ctx)
|
---|
| 366 | {
|
---|
| 367 | kpu_task_t *task = (kpu_task_t *)ctx;
|
---|
| 368 | kpu->interrupt_clear.reg = 7;
|
---|
| 369 | if(task->remain_layers_length <= LAYER_BURST_SIZE)
|
---|
| 370 | {
|
---|
| 371 | for(uint32_t i = 0; i < task->remain_layers_length; i++)
|
---|
| 372 | {
|
---|
| 373 | kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
---|
| 374 | kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
---|
| 375 | kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
|
---|
| 376 | kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
|
---|
| 377 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
|
---|
| 378 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
|
---|
| 379 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
|
---|
| 380 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
|
---|
| 381 | kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
|
---|
| 382 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
|
---|
| 383 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
|
---|
| 384 | kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
|
---|
| 385 | }
|
---|
| 386 | task->remain_layers_length = 0;
|
---|
| 387 | kpu->interrupt_mask.reg = 7;
|
---|
| 388 | } else
|
---|
| 389 | {
|
---|
| 390 | for(uint32_t i = 0; i < LAYER_BURST_SIZE; i++)
|
---|
| 391 | {
|
---|
| 392 | kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
---|
| 393 | kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
---|
| 394 | kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
|
---|
| 395 | kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
|
---|
| 396 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
|
---|
| 397 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
|
---|
| 398 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
|
---|
| 399 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
|
---|
| 400 | kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
|
---|
| 401 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
|
---|
| 402 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
|
---|
| 403 | kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
|
---|
| 404 | }
|
---|
| 405 | task->remain_layers += LAYER_BURST_SIZE;
|
---|
| 406 | task->remain_layers_length -= LAYER_BURST_SIZE;
|
---|
| 407 | }
|
---|
| 408 | return 0;
|
---|
| 409 | }
|
---|
| 410 |
|
---|
/*
 * Arm the output DMA for the kpu_start() path: route AI RX requests to
 * the task's channel and stream the KPU FIFO into task->dst
 * (task->dst_length 64-bit beats). kpu_done() fires on completion.
 */
static void kpu_data_output(kpu_task_t *task)
{
    select_dma_channel(task->dma_ch, DMA_SELECT_AI_RX_REQ);
    kpu_dmac_irq_register(task->dma_ch, kpu_done, task, 1);
    dmac_set_single_mode(task->dma_ch, (void *)(&kpu->fifo_data_out), (void *)(task->dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, task->dst_length);
}
|
---|
| 418 |
|
---|
/*
 * Input-DMA completion handler for kpu_start(): finishes the input
 * transfer, arms the output DMA, programs the KPU FIFO and interrupt
 * registers, and pushes the first burst of layer arguments.
 */
static int kpu_data_ready(void *ctx)
{
    kpu_task_t *task = (kpu_task_t *)ctx;

    dma_end(&g_ai_hdma);
    kpu_data_output(task);

    kpu->eight_bit_mode.reg = task->eight_bit_mode;
    /* Touch all three KPU interrupt bits while (re)configuring. */
    kpu->interrupt_mask.reg = 7;
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 12, .fifo_empty_threshold = 1};

    /* Subsequent AI interrupts refill the layer FIFO. */
    plic_set_priority(INTNO_AI, 2);
    plic_irq_register(INTNO_AI, kpu_config_input, task);
    plic_irq_enable(INTNO_AI);
    kpu_config_input(task);
    /* NOTE(review): interrupt mask bit polarity is not visible here;
     * this mirrors the settings used by kpu_init(). */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    return 0;
}
|
---|
| 442 |
|
---|
/*
 * Start the input phase of kpu_start(). When task->src is NULL the
 * input data is assumed to already be in KPU I/O RAM and the DMA is
 * skipped; otherwise the source image is copied into the first layer's
 * image area and kpu_data_ready() fires on completion.
 */
static void kpu_data_input(kpu_task_t *task)
{
    if(task->src == NULL)
    {
        kpu_data_ready(task);
        return;
    }
    kpu_dmac_irq_register(task->dma_ch, kpu_data_ready, task, 1);
    kpu_layer_argument_t *layer = &task->layers[0];
    /* image_src_addr is multiplied by 64, so it appears to address
     * 64-byte units inside the KPU I/O RAM — TODO confirm. */
    dmac_set_single_mode(task->dma_ch, (void *)(uintptr_t)task->src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, task->src_length);
}
|
---|
| 455 |
|
---|
/*
 * Prepare a task for kpu_start(): enables the KPU clock, flags the last
 * layer to stream and signal its results, computes the input/output DMA
 * sizes (in 8-byte beats, not bytes), and allocates a zeroed output
 * buffer in task->dst (released by kpu_single_task_deinit()).
 * Returns 0 on success, 1 if the output buffer cannot be allocated.
 */
int kpu_single_task_init(kpu_task_t *task)
{
    /*
     * Enable the AI accelerator clock.
     */
    sil_orw_mem((uint32_t *)(TADR_SYSCTL_BASE+TOFF_SYSCTL_CLK_EN_PERI), SYSCTL_CLK_EN_PERI_AI_CLK_EN);

    kpu_layer_argument_t *first_layer = &task->layers[0];
    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];

    last_layer->dma_parameter.data.send_data_out = 1;
    last_layer->interrupt_enabe.data.int_en = 1;
    /* src_length/dst_length are in 8-byte DMA beats. */
    task->src_length = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1) / 8;
    task->dst_length = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8;
    task->dst = (uint64_t *)malloc(task->dst_length * 8);
    if(task->dst == NULL)
        return 1;
    memset(task->dst, 0, task->dst_length * 8);
    return 0;
}
|
---|
| 476 |
|
---|
| 477 | int kpu_single_task_deinit(kpu_task_t *task)
|
---|
| 478 | {
|
---|
| 479 | free(task->dst);
|
---|
| 480 | return 0;
|
---|
| 481 | }
|
---|
| 482 |
|
---|
| 483 | int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta)
|
---|
| 484 | {
|
---|
| 485 | uintptr_t base_addr = (uintptr_t)buffer;
|
---|
| 486 | kpu_model_header_t *header = (kpu_model_header_t *)buffer;
|
---|
| 487 | kpu_model_layer_metadata_t *layer_meta = (kpu_model_layer_metadata_t *)(base_addr + sizeof(kpu_model_header_t));
|
---|
| 488 | kpu_layer_argument_t *layers = (kpu_layer_argument_t *)(base_addr + header->layers_argument_start);
|
---|
| 489 |
|
---|
| 490 | if(header->version != 1)
|
---|
| 491 | return -1;
|
---|
| 492 | uint32_t layers_length = header->layers_length;
|
---|
| 493 | task->layers_length = layers_length;
|
---|
| 494 | task->eight_bit_mode = header->flags & 1;
|
---|
| 495 | task->layers = layers;
|
---|
| 496 | task->output_scale = layer_meta[layers_length - 1].output_scale;
|
---|
| 497 | task->output_bias = layer_meta[layers_length - 1].output_bias;
|
---|
| 498 | size_t i;
|
---|
| 499 | for(i = 0; i < layers_length; i++)
|
---|
| 500 | {
|
---|
| 501 | layers[i].kernel_load_cfg.data.para_start_addr = (uint64_t)(base_addr + layer_meta[i].weigths_offset);
|
---|
| 502 | layers[i].kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)(base_addr + layer_meta[i].bn_offset);
|
---|
| 503 | layers[i].kernel_calc_type_cfg.data.active_addr = (uint64_t)(base_addr + layer_meta[i].act_offset);
|
---|
| 504 | }
|
---|
| 505 |
|
---|
| 506 | if(meta)
|
---|
| 507 | *meta = layer_meta;
|
---|
| 508 | return 0;
|
---|
| 509 | }
|
---|
| 510 |
|
---|
/*
 * Start execution of a task prepared by kpu_single_task_init().
 * Completion is reported through task->callback (see kpu_done()).
 * Returns -1 if the KPU is already busy, 0 when the run was started.
 */
int kpu_start(kpu_task_t *task)
{
    /* Claim the KPU; fail if a previous run has not completed. */
    if(atomic_cas(&kpu_status, 0, 1))
        return -1;

    task->remain_layers_length = task->layers_length;
    task->remain_layers = task->layers;
    kpu_data_input(task);
    return 0;
}
|
---|
| 521 |
|
---|
/*
 * Write one layer's 12 argument words into the KPU layer-argument FIFO.
 * The same fixed write order is used everywhere in this file; do not
 * reorder these stores.
 */
static void kpu_send_layer(const kpu_layer_argument_t *layer)
{
    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
    kpu->layer_argument_fifo = layer->image_addr.reg;
    kpu->layer_argument_fifo = layer->image_channel_num.reg;
    kpu->layer_argument_fifo = layer->image_size.reg;
    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_offset.reg;
    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
    kpu->layer_argument_fifo = layer->conv_value.reg;
    kpu->layer_argument_fifo = layer->conv_value2.reg;
    kpu->layer_argument_fifo = layer->dma_parameter.reg;
}
|
---|
| 537 |
|
---|
/*
 * One-time KPU setup: clears pending interrupts, programs the FIFO
 * thresholds and the 8-bit mode flag, configures the interrupt mask,
 * and registers `callback` (with `userdata`) as the AI interrupt
 * handler at priority 1.
 */
void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
{
    /* Drop any stale interrupt state (all three bits). */
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = eight_bit_mode};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};

    plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, callback, userdata);
    plic_irq_enable(INTNO_AI);
}
|
---|
| 554 |
|
---|
/*
 * DMA an input image into the KPU I/O RAM for `layer`, invoking
 * `callback` (with `userdata`) when the transfer completes. The byte
 * count is derived from the layer geometry the same way as in
 * kpu_run_dma_input(); transfers are 64-bit, hence input_size / 8 beats.
 */
void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
{
    uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
}
|
---|
| 562 |
|
---|
/* Queue a single convolution layer into the KPU argument FIFO. */
static void kpu_conv2d_core(kpu_layer_argument_t *layer)
{
    kpu_send_layer(layer);
}
|
---|
| 567 |
|
---|
/*
 * Run a single convolution layer with its result kept in KPU memory
 * (no output DMA is armed). Pending interrupts are cleared and the
 * interrupt mask configured before the layer is queued.
 */
void kpu_conv2d(kpu_layer_argument_t *layer)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_conv2d_core(layer);
}
|
---|
| 580 |
|
---|
/*
 * Run a single convolution layer and DMA its output to `dest`;
 * `callback` fires when the output transfer completes.
 * The transfer length rounds (dma_total_byte + 1) bytes up to whole
 * 8-byte beats: (x + 8) / 8 == ((x + 1) + 7) / 8.
 */
void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* Make the layer stream its result to the FIFO, then arm the DMA
     * before queueing the layer. */
    layer->dma_parameter.data.send_data_out = 1;
    select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer->dma_parameter.data.dma_total_byte + 8) / 8);
    kpu_conv2d_core(layer);
}
|
---|
| 598 |
|
---|
/*
 * Run a convolution layer in "full add" mode: one 64-bit accumulated
 * value per output channel (o_ch_num + 1 channels) is streamed to
 * `dest`. `callback` fires when the output DMA completes.
 */
void kpu_conv2d_output_full_add(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint64_t *dest, plic_irq_callback_t callback, void *userdata)
{
    /* o_ch_num stores (channels - 1). */
    uint32_t channels = layer->image_channel_num.data.o_ch_num + 1;
    layer->interrupt_enabe.data.full_add = 1;

    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    layer->dma_parameter.data.send_data_out = 1;
    select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    /* One 64-bit beat per channel. */
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, channels);
    kpu_conv2d_core(layer);
}
|
---|
| 619 |
|
---|
| 620 | void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, size_t count, uint8_t *dest, const quantize_param_t *dest_param)
|
---|
| 621 | {
|
---|
| 622 | quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
|
---|
| 623 |
|
---|
| 624 | size_t i;
|
---|
| 625 | for(i = 0; i < count; i++)
|
---|
| 626 | {
|
---|
| 627 | int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
|
---|
| 628 | if(value < 0)
|
---|
| 629 | value = 0;
|
---|
| 630 | if(value > 0xFF)
|
---|
| 631 | value = 0xFF;
|
---|
| 632 | *dest++ = value;
|
---|
| 633 | }
|
---|
| 634 | }
|
---|
| 635 |
|
---|
/*
 * Global average pooling: averages `kernel_size` consecutive source
 * bytes per channel and requantizes from src_param to dest_param.
 * If `dest` points into the KPU I/O RAM window, each channel's single
 * output byte is scattered using the KPU's padded row layout (the
 * row_padding/row_group/row_length/height constants below are fixed
 * for a 1x1 output); otherwise dest is a plain channels-long array.
 */
void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
{
    quantize_param_t q1 = *src_param, q2 = *dest_param;
    size_t oc, y, x;

    if(((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
    {
        /* KPU I/O RAM layout parameters for a 1x1 feature map. */
        uint32_t row_padding = 16;
        uint32_t row_group = 4;
        uint32_t row_length = 1;
        uint32_t height = 4;

        for(oc = 0; oc < channels; oc++)
        {
            /* Base address of this channel inside the padded layout. */
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            /* The output is 1x1, so both loops run exactly once. */
            for(y = 0; y < 1; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for(x = 0; x < 1; x++)
                {
                    int64_t sum = 0;
                    size_t i;
                    for(i = 0; i < kernel_size; i++)
                        sum += *src++;

                    /* Dequantize, average, requantize, clamp to u8. */
                    int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
                    if(value < 0)
                        value = 0;
                    if(value > 0xFF)
                        value = 0xFF;
                    y_origin[x] = value;
                }
            }
        }
    } else
    {
        for(oc = 0; oc < channels; oc++)
        {
            int64_t sum = 0;
            size_t i;
            for(i = 0; i < kernel_size; i++)
                sum += *src++;

            /* Dequantize, average, requantize, clamp to u8. */
            int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
            if(value < 0)
                value = 0;
            if(value > 0xFF)
                value = 0xFF;
            dest[oc] = value;
        }
    }
}
|
---|
| 688 |
|
---|
| 689 | void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
|
---|
| 690 | {
|
---|
| 691 | quantize_param_t q = *src_param;
|
---|
| 692 | size_t oc;
|
---|
| 693 |
|
---|
| 694 | for(oc = 0; oc < channels; oc++)
|
---|
| 695 | {
|
---|
| 696 | int64_t sum = 0;
|
---|
| 697 | size_t i;
|
---|
| 698 | for(i = 0; i < kernel_size; i++)
|
---|
| 699 | sum += *src++;
|
---|
| 700 |
|
---|
| 701 | float value = (sum * q.scale + q.bias) / kernel_size;
|
---|
| 702 | dest[oc] = value;
|
---|
| 703 | }
|
---|
| 704 | }
|
---|
| 705 |
|
---|
| 706 | void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param)
|
---|
| 707 | {
|
---|
| 708 | quantize_param_t q1 = *dest_param;
|
---|
| 709 | size_t i = 0;
|
---|
| 710 | for(i = 0; i < channels; i++)
|
---|
| 711 | *dest++ = src[i * 16] * q1.scale + q1.bias;
|
---|
| 712 | }
|
---|
| 713 |
|
---|
/* Software fully-connected layer:
 *   dest[oc] = dot(src, weights[oc * input_channels ..]) + biases[oc]
 * `weights` is row-major, one row of `input_channels` floats per output. */
void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels)
{
    int out_idx, in_idx;

    for(out_idx = 0; out_idx < output_channels; out_idx++)
    {
        const float *row = &weights[out_idx * input_channels];
        float acc = 0.0f;

        for(in_idx = 0; in_idx < input_channels; in_idx++)
            acc += src[in_idx] * row[in_idx];

        /* Bias is added after the full dot product, matching the
         * accumulation order of the original implementation. */
        dest[out_idx] = acc + biases[out_idx];
    }
}
|
---|
| 727 |
|
---|
| 728 | void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest)
|
---|
| 729 | {
|
---|
| 730 | quantize_param_t q1 = *src_param;
|
---|
| 731 | size_t i = 0;
|
---|
| 732 | for(i = 0; i < count; i++)
|
---|
| 733 | *dest++ = src[i] * q1.scale + q1.bias;
|
---|
| 734 | }
|
---|
| 735 |
|
---|
| 736 | void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels)
|
---|
| 737 | {
|
---|
| 738 | uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64);
|
---|
| 739 | size_t oc, y, x;
|
---|
| 740 |
|
---|
| 741 | uint32_t row_padding;
|
---|
| 742 | uint32_t row_group;
|
---|
| 743 | uint32_t row_length;
|
---|
| 744 |
|
---|
| 745 | if(width <= 16)
|
---|
| 746 | {
|
---|
| 747 | row_padding = 16;
|
---|
| 748 | row_group = 4;
|
---|
| 749 | row_length = 1;
|
---|
| 750 | } else if(width <= 32)
|
---|
| 751 | {
|
---|
| 752 | row_padding = 32;
|
---|
| 753 | row_group = 2;
|
---|
| 754 | row_length = 1;
|
---|
| 755 | } else
|
---|
| 756 | {
|
---|
| 757 | row_padding = 64;
|
---|
| 758 | row_group = 1;
|
---|
| 759 | row_length = (width + 63) / 64;
|
---|
| 760 | }
|
---|
| 761 |
|
---|
| 762 | for(oc = 0; oc < channels; oc++)
|
---|
| 763 | {
|
---|
| 764 | uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
|
---|
| 765 | for(y = 0; y < height; y++)
|
---|
| 766 | {
|
---|
| 767 | uint8_t *y_origin = channel_origin + y * row_length * 64;
|
---|
| 768 | for(x = 0; x < width; x++)
|
---|
| 769 | y_origin[x] = *src++;
|
---|
| 770 | }
|
---|
| 771 | }
|
---|
| 772 | }
|
---|
| 773 | #if USE_CACHED_AI_RAM
|
---|
| 774 | static void kpu_flush_cache(uint32_t addr, size_t lines)
|
---|
| 775 | {
|
---|
| 776 | size_t line;
|
---|
| 777 | for(line = 0; line < lines; line++)
|
---|
| 778 | {
|
---|
| 779 | const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
|
---|
| 780 | uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
|
---|
| 781 | size_t i;
|
---|
| 782 | for(i = 0; i < 8; i++)
|
---|
| 783 | dest[i] = src[i];
|
---|
| 784 | }
|
---|
| 785 | }
|
---|
| 786 | #endif
|
---|
/* Arithmetic right shift by `shift` with a carry-style adjustment based
 * on the last bit shifted out: when that bit is 1, positive values are
 * bumped up and negative values bumped down by one after the final
 * shift.  shift == 0 returns the value unchanged.
 * NOTE(review): presumably mirrors the KPU hardware's carry-shift
 * rounding — confirm against the K210 datasheet before changing. */
static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
{
    int64_t v = value;

    if(shift == 0)
        return v;

    /* Shift all but the last position, then inspect the bit that the
     * final single-bit shift would discard. */
    v >>= shift - 1;
    if((v & 0x1) == 0)
        return v >> 1;

    return (v < 0) ? (v >> 1) - 1 : (v >> 1) + 1;
}
|
---|
/* Upload a CHW uint8 image from `src` into KPU I/O RAM at `kpu_addr`
 * (a 64-byte-unit address), applying the KPU input layout: rows padded
 * to 16/32/64 bytes by width bucket, with several channels sharing one
 * 64-byte row group for narrow images (same scheme as
 * kpu_input_with_padding).  Takes a 64-bit fast path when `src` is
 * 8-byte aligned and the width is a multiple of 8; otherwise falls back
 * to a byte-by-byte copy. */
static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
{
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
    size_t oc, y, x;
    uint32_t row_padding;
    uint32_t row_group;
    uint32_t row_length;
    /* Select the row layout bucket from the image width. */
    if(width <= 16)
    {
        row_padding = 16;
        row_group = 4;
        row_length = 1;
    } else if(width <= 32)
    {
        row_padding = 32;
        row_group = 2;
        row_length = 1;
    } else
    {
        row_padding = 64;
        row_group = 1;
        row_length = (width + 63) / 64;
    }

    if((uintptr_t)src % 8 == 0 && width % 8 == 0)
    {
/* Fast path: the BEGIN macro opens the channel/row loops and computes
 * the 64-bit row origin; the caller supplies the per-row copy body, and
 * END closes the loops.  The macros must splice exactly around the body. */
#define UPLOAD_BEGIN() \
    for(oc = 0; oc < channels; oc++) \
    { \
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
        for(y = 0; y < height; y++) \
        { \
            uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);

#define UPLOAD_END() \
    } \
    }

        /* From here on, `width` counts 64-bit words, not bytes. */
        width /= 8;
        const uint64_t *u64_src = (const uint64_t *)src;
        /* Specialized copies for the common row sizes (8/16/32 bytes),
         * generic word loop otherwise. */
        if(width == 1)
        {
            UPLOAD_BEGIN()
            y_origin[0] = *u64_src++;
            UPLOAD_END()
        } else if(width == 2)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
            }
            UPLOAD_END()
        } else if(width == 4)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
                y_origin[2] = *u64_src++;
                y_origin[3] = *u64_src++;
            }
            UPLOAD_END()
        } else
        {
            UPLOAD_BEGIN()
            for(x = 0; x < width; x++)
                y_origin[x] = *u64_src++;
            UPLOAD_END()
        }
    } else
    {
        /* Slow path: unaligned source or odd width, copy byte by byte. */
        for(oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for(y = 0; y < height; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for(x = 0; x < width; x++)
                    y_origin[x] = *src++;
            }
        }
    }
}
|
---|
| 890 | static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
|
---|
| 891 | {
|
---|
| 892 | size_t width = layer->image_size.data.i_row_wid + 1;
|
---|
| 893 | size_t height = layer->image_size.data.i_col_high + 1;
|
---|
| 894 | size_t channels = layer->image_channel_num.data.i_ch_num + 1;
|
---|
| 895 |
|
---|
| 896 | kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
|
---|
| 897 | }
|
---|
| 898 |
|
---|
/* Feed a float input tensor to the model: plain copy of `count` floats
 * into the main buffer (regions must not overlap). */
static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
{
    memcpy(dest, src, sizeof(*dest) * count);
}
|
---|
| 903 |
|
---|
| 904 | static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
|
---|
| 905 | {
|
---|
| 906 | size_t i;
|
---|
| 907 |
|
---|
| 908 | if(act == KLA_RELU)
|
---|
| 909 | {
|
---|
| 910 | for(i = 0; i < count; i++)
|
---|
| 911 | data[i] = max(data[i], 0);
|
---|
| 912 | } else if(act == KLA_RELU6)
|
---|
| 913 | {
|
---|
| 914 | for(i = 0; i < count; i++)
|
---|
| 915 | data[i] = min(max(data[i], 0), 6);
|
---|
| 916 | }
|
---|
| 917 | }
|
---|
| 918 |
|
---|
| 919 | static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 920 | {
|
---|
| 921 | const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
|
---|
| 922 | const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
|
---|
| 923 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 924 | size_t i, count = arg->count;
|
---|
| 925 |
|
---|
| 926 | for(i = 0; i < count; i++)
|
---|
| 927 | dest[i] = src_a[i] + src_b[i];
|
---|
| 928 | }
|
---|
| 929 |
|
---|
/* Add two uint8-quantized tensors elementwise and requantize the result.
 *
 * Each input byte is mapped to (x + offset) * multiplier, the operands
 * are summed, rescaled by mul_o, rounded via kpu_carry_shift, offset by
 * off_o, and clamped to [0, 255].  The loop is manually unrolled 8 wide;
 * `count` is the number of 8-element groups (arg->count rounded up), so
 * the buffers are presumably padded to a multiple of 8 bytes — TODO
 * confirm against the allocator.
 *
 * Two variants exist: when both inputs use the same shift (sh_a == sh_b)
 * the shift is applied once to the sum, preserving extra precision;
 * otherwise each operand is shifted individually before the addition. */
static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
    const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
    size_t count = ALIGN_UP(arg->count, 8) / 8;
    int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
    int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
    int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;

    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t i;

    if(sh_a == sh_b)
    {
/* Shared-shift variant.  Stages: 1 load, 2 add input offsets,
 * 3 apply input multipliers, 4 sum, 5 single shared shift of the sum,
 * 6 output multiplier, 7 rounding shift, 8 output offset, 9 clamp,
 * 10 store. */
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#define QADD_UNROLL_4(x) \
    int64_t v##x = a##x + b##x;

#define QADD_UNROLL_5(x) \
    v##x >>= sh_a;

#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#define QADD_UNROLL_8(x) \
    v##x += off_o;

#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

/* Apply one stage to all 8 lanes of the unrolled group. */
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for(i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    } else
    {
/* Independent-shift variant: stage 4 shifts each operand separately and
 * stage 5 sums them; later stages match the shared-shift variant. */
#undef QADD_UNROLL_1
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#undef QADD_UNROLL_2
#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#undef QADD_UNROLL_3
#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#undef QADD_UNROLL_4
#define QADD_UNROLL_4(x) \
    a##x >>= sh_a; \
    b##x >>= sh_b;

#undef QADD_UNROLL_5
#define QADD_UNROLL_5(x) \
    int64_t v##x = a##x + b##x;

#undef QADD_UNROLL_6
#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#undef QADD_UNROLL_7
#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#undef QADD_UNROLL_8
#define QADD_UNROLL_8(x) \
    v##x += off_o;

#undef QADD_UNROLL_9
#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#undef QADD_UNROLL_10
#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

#undef QADD_UNROLL_S
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for(i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
}
|
---|
| 1072 |
|
---|
| 1073 | static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1074 | {
|
---|
| 1075 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1076 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1077 | size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
|
---|
| 1078 |
|
---|
| 1079 | for(oc = 0; oc < channels; oc++)
|
---|
| 1080 | {
|
---|
| 1081 | float sum = 0.f;
|
---|
| 1082 | size_t i;
|
---|
| 1083 | for(i = 0; i < kernel_size; i++)
|
---|
| 1084 | sum += *src++;
|
---|
| 1085 |
|
---|
| 1086 | dest[oc] = sum / kernel_size;
|
---|
| 1087 | }
|
---|
| 1088 | }
|
---|
| 1089 |
|
---|
/* Max-pooling over a uint8 CHW tensor with stride and zero padding.
 * Window bounds are clipped to the input plane; because `value` starts
 * at 0, padded positions behave as implicit zeros. */
static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for(oc = 0; oc < out_shape.channels; oc++)
    {
        /* Input plane for this channel (CHW layout). */
        const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
        for(out_y = 0; out_y < out_shape.height; out_y++)
        {
            for(out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Top-left of the pooling window in input coordinates;
                 * may be negative because of padding. */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel range to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                /* Starting at 0 makes padding contribute implicit zeros. */
                uint8_t value = 0;

                int32_t kernel_y, kernel_x;
                for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value = max(value, channel_src[in_y * in_shape.width + in_x]);
                    }
                }

                *dest++ = value;
            }
        }
    }
}
|
---|
| 1132 |
|
---|
/* Average pooling over a float CHW tensor with stride and padding.
 * Only in-bounds samples are averaged: padded positions are excluded
 * from both the sum and the divisor (count-includes-valid-only style). */
static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for(oc = 0; oc < out_shape.channels; oc++)
    {
        /* Input plane for this channel (CHW layout). */
        const float *channel_src = src + in_shape.width * in_shape.height * oc;
        for(out_y = 0; out_y < out_shape.height; out_y++)
        {
            for(out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Top-left of the pooling window in input coordinates;
                 * may be negative because of padding. */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel range to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                float value = 0;
                /* Number of valid (in-bounds) samples in this window. */
                float kernel_count = 0;

                int32_t kernel_y, kernel_x;
                for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value += channel_src[in_y * in_shape.width + in_x];
                        kernel_count++;
                    }
                }

                *dest++ = value / kernel_count;
            }
        }
    }
}
|
---|
| 1177 |
|
---|
| 1178 | static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1179 | {
|
---|
| 1180 | size_t count = arg->count;
|
---|
| 1181 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1182 |
|
---|
| 1183 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
| 1184 |
|
---|
| 1185 | float scale = 1.f / q.scale;
|
---|
| 1186 |
|
---|
| 1187 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
|
---|
| 1188 | size_t i;
|
---|
| 1189 | for(i = 0; i < count; i++)
|
---|
| 1190 | {
|
---|
| 1191 | int value = roundf((*src++ - q.bias) * scale);
|
---|
| 1192 | if(value < 0)
|
---|
| 1193 | value = 0;
|
---|
| 1194 | if(value > 0xFF)
|
---|
| 1195 | value = 0xFF;
|
---|
| 1196 | *dest++ = (uint8_t)value;
|
---|
| 1197 | }
|
---|
| 1198 | }
|
---|
| 1199 |
|
---|
| 1200 | static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1201 | {
|
---|
| 1202 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1203 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1204 | size_t oc, count = arg->count;
|
---|
| 1205 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
| 1206 |
|
---|
| 1207 | for(oc = 0; oc < count; oc++)
|
---|
| 1208 | dest[oc] = *src++ * q.scale + q.bias;
|
---|
| 1209 | }
|
---|
| 1210 |
|
---|
| 1211 | static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1212 | {
|
---|
| 1213 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1214 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1215 | size_t oc, i, channels = arg->channels, count = arg->channel_size;
|
---|
| 1216 |
|
---|
| 1217 | for(oc = 0; oc < channels; oc++)
|
---|
| 1218 | {
|
---|
| 1219 | const kpu_model_quant_param_t q = arg->quant_params[oc];
|
---|
| 1220 |
|
---|
| 1221 | for(i = 0; i < count; i++)
|
---|
| 1222 | *dest++ = *src++ * q.scale + q.bias;
|
---|
| 1223 | }
|
---|
| 1224 | }
|
---|
| 1225 |
|
---|
| 1226 | static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1227 | {
|
---|
| 1228 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1229 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1230 | size_t oc, count = arg->count;
|
---|
| 1231 | const uint8_t *table = arg->table;
|
---|
| 1232 |
|
---|
| 1233 | if(false && count % 8 == 0)
|
---|
| 1234 | {
|
---|
| 1235 | for(oc = 0; oc < count;)
|
---|
| 1236 | {
|
---|
| 1237 | dest[oc++] = table[*src++];
|
---|
| 1238 | dest[oc++] = table[*src++];
|
---|
| 1239 | dest[oc++] = table[*src++];
|
---|
| 1240 | dest[oc++] = table[*src++];
|
---|
| 1241 | dest[oc++] = table[*src++];
|
---|
| 1242 | dest[oc++] = table[*src++];
|
---|
| 1243 | dest[oc++] = table[*src++];
|
---|
| 1244 | dest[oc++] = table[*src++];
|
---|
| 1245 | }
|
---|
| 1246 | } else
|
---|
| 1247 | {
|
---|
| 1248 | for(oc = 0; oc < count; oc++)
|
---|
| 1249 | dest[oc] = table[src[oc]];
|
---|
| 1250 | }
|
---|
| 1251 | }
|
---|
| 1252 |
|
---|
| 1253 | static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1254 | {
|
---|
| 1255 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1256 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1257 | size_t oc, channels = arg->channels;
|
---|
| 1258 |
|
---|
| 1259 | float sum = 0.f;
|
---|
| 1260 | const float epsilon = 1e-10f;
|
---|
| 1261 | for(oc = 0; oc < channels; oc++)
|
---|
| 1262 | sum += src[oc] * src[oc];
|
---|
| 1263 | if(sum < epsilon)
|
---|
| 1264 | sum = epsilon;
|
---|
| 1265 | sum = 1.f / sqrtf(sum);
|
---|
| 1266 | for(oc = 0; oc < channels; oc++)
|
---|
| 1267 | dest[oc] = src[oc] * sum;
|
---|
| 1268 | }
|
---|
| 1269 |
|
---|
| 1270 | static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1271 | {
|
---|
| 1272 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1273 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1274 | size_t oc, channels = arg->channels;
|
---|
| 1275 |
|
---|
| 1276 | float max = FLT_MIN;
|
---|
| 1277 | for(oc = 0; oc < channels; oc++)
|
---|
| 1278 | max = fmaxf(max, src[oc]);
|
---|
| 1279 |
|
---|
| 1280 | float sum = 0.f;
|
---|
| 1281 | for(oc = 0; oc < channels; oc++)
|
---|
| 1282 | {
|
---|
| 1283 | float value = expf(src[oc] - max);
|
---|
| 1284 | sum += value;
|
---|
| 1285 | dest[oc] = value;
|
---|
| 1286 | }
|
---|
| 1287 |
|
---|
| 1288 | for(oc = 0; oc < channels; oc++)
|
---|
| 1289 | dest[oc] /= sum;
|
---|
| 1290 | }
|
---|
| 1291 |
|
---|
| 1292 | static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1293 | {
|
---|
| 1294 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1295 | uint32_t count = arg->input_count, i;
|
---|
| 1296 |
|
---|
| 1297 | for(i = 0; i < count; i++)
|
---|
| 1298 | {
|
---|
| 1299 | kpu_model_memory_range_t input = arg->inputs_mem[i];
|
---|
| 1300 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
|
---|
| 1301 | memcpy(dest, src, input.size);
|
---|
| 1302 | dest += input.size;
|
---|
| 1303 | }
|
---|
| 1304 | }
|
---|
| 1305 |
|
---|
| 1306 | static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1307 | {
|
---|
| 1308 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1309 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1310 | uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
|
---|
| 1311 | float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
|
---|
| 1312 | float *bias = (float *)malloc(out_channels * sizeof(float));
|
---|
| 1313 | memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
|
---|
| 1314 | memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
|
---|
| 1315 |
|
---|
| 1316 | if(in_channels % 8 == 0)
|
---|
| 1317 | {
|
---|
| 1318 | #define FC_UNROLL_1(x) \
|
---|
| 1319 | float i##x = *c_src++; \
|
---|
| 1320 | float w##x = *c_weights++;
|
---|
| 1321 |
|
---|
| 1322 | #define FC_UNROLL_2(x) \
|
---|
| 1323 | sum += i##x * w##x;
|
---|
| 1324 |
|
---|
| 1325 | #define FC_UNROLL_S(x) \
|
---|
| 1326 | FC_UNROLL_##x(0) \
|
---|
| 1327 | FC_UNROLL_##x(1) \
|
---|
| 1328 | FC_UNROLL_##x(2) \
|
---|
| 1329 | FC_UNROLL_##x(3) \
|
---|
| 1330 | FC_UNROLL_##x(4) \
|
---|
| 1331 | FC_UNROLL_##x(5) \
|
---|
| 1332 | FC_UNROLL_##x(6) \
|
---|
| 1333 | FC_UNROLL_##x(7)
|
---|
| 1334 |
|
---|
| 1335 | for(oc = 0; oc < out_channels; oc++)
|
---|
| 1336 | {
|
---|
| 1337 | const float *c_src = src;
|
---|
| 1338 | const float *c_weights = weights + oc * in_channels;
|
---|
| 1339 |
|
---|
| 1340 | float sum = 0.0f;
|
---|
| 1341 | for(ic = 0; ic < in_channels / 8; ic++)
|
---|
| 1342 | {
|
---|
| 1343 | FC_UNROLL_S(1);
|
---|
| 1344 | FC_UNROLL_S(2);
|
---|
| 1345 | }
|
---|
| 1346 |
|
---|
| 1347 | dest[oc] = sum + bias[oc];
|
---|
| 1348 | }
|
---|
| 1349 | } else
|
---|
| 1350 | {
|
---|
| 1351 | for(oc = 0; oc < out_channels; oc++)
|
---|
| 1352 | {
|
---|
| 1353 | const float *c_weights = weights + oc * in_channels;
|
---|
| 1354 |
|
---|
| 1355 | float sum = 0.0f;
|
---|
| 1356 | for(ic = 0; ic < in_channels; ic++)
|
---|
| 1357 | sum += src[ic] * c_weights[ic];
|
---|
| 1358 | dest[oc] = sum + bias[oc];
|
---|
| 1359 | }
|
---|
| 1360 | }
|
---|
| 1361 | free(weights);
|
---|
| 1362 | free(bias);
|
---|
| 1363 | kpu_float_activation(dest, out_channels, arg->act);
|
---|
| 1364 | }
|
---|
| 1365 |
|
---|
| 1366 | static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1367 | {
|
---|
| 1368 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1369 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1370 | kpu_model_shape_t in_shape = arg->shape;
|
---|
| 1371 | uint32_t oc, oy, ox;
|
---|
| 1372 |
|
---|
| 1373 | for(oy = 0; oy < in_shape.height; oy++)
|
---|
| 1374 | for(ox = 0; ox < in_shape.width; ox++)
|
---|
| 1375 | for(oc = 0; oc < in_shape.channels; oc++)
|
---|
| 1376 | *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
|
---|
| 1377 | }
|
---|
| 1378 |
|
---|
| 1379 | static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1380 | {
|
---|
| 1381 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1382 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1383 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
| 1384 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
| 1385 | uint32_t oc, oy, ox;
|
---|
| 1386 |
|
---|
| 1387 | float height_scale = (float)in_shape.height / out_height;
|
---|
| 1388 | float width_scale = (float)in_shape.width / out_width;
|
---|
| 1389 |
|
---|
| 1390 | for(oc = 0; oc < in_shape.channels; oc++)
|
---|
| 1391 | {
|
---|
| 1392 | const float *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
| 1393 | for(oy = 0; oy < out_height; oy++)
|
---|
| 1394 | {
|
---|
| 1395 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
| 1396 | const float *y_origin = channel_src + in_y * in_shape.width;
|
---|
| 1397 | for(ox = 0; ox < out_width; ox++)
|
---|
| 1398 | {
|
---|
| 1399 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
| 1400 | *dest++ = y_origin[in_x];
|
---|
| 1401 | }
|
---|
| 1402 | }
|
---|
| 1403 | }
|
---|
| 1404 | }
|
---|
| 1405 |
|
---|
| 1406 | static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1407 | {
|
---|
| 1408 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1409 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1410 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
| 1411 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
| 1412 | uint32_t oc, oy, ox;
|
---|
| 1413 |
|
---|
| 1414 | float height_scale = (float)in_shape.height / out_height;
|
---|
| 1415 | float width_scale = (float)in_shape.width / out_width;
|
---|
| 1416 |
|
---|
| 1417 | for(oc = 0; oc < in_shape.channels; oc++)
|
---|
| 1418 | {
|
---|
| 1419 | const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
| 1420 | for(oy = 0; oy < out_height; oy++)
|
---|
| 1421 | {
|
---|
| 1422 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
| 1423 | const uint8_t *y_origin = channel_src + in_y * in_shape.width;
|
---|
| 1424 | for(ox = 0; ox < out_width; ox++)
|
---|
| 1425 | {
|
---|
| 1426 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
| 1427 | *dest++ = y_origin[in_x];
|
---|
| 1428 | }
|
---|
| 1429 | }
|
---|
| 1430 | }
|
---|
| 1431 | }
|
---|
| 1432 |
|
---|
| 1433 | static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1434 | {
|
---|
| 1435 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1436 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1437 | size_t oc, channels = arg->channels;
|
---|
| 1438 |
|
---|
| 1439 | for(oc = 0; oc < channels; oc++)
|
---|
| 1440 | dest[oc] = 1.f / (1.f + expf(-src[oc]));
|
---|
| 1441 | }
|
---|
| 1442 |
|
---|
/* Execute one hardware convolution layer described by `arg`.
 * Copies the layer descriptor out of the model buffer, patches its
 * weight / batch-norm / activation table addresses, configures KPU
 * interrupts (and, for KLF_MAIN_MEM_OUT, an AI-RX DMA transfer into the
 * main buffer), then pushes the descriptor to the KPU. */
static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
    /* Rebase blob offsets to addresses the KPU can fetch.  NOTE(review):
     * IOMEM is subtracted — presumably converting the CPU's I/O alias to
     * the DMA-visible physical address; confirm against the K210 memory
     * map. */
    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;

    if(arg->flags & KLF_MAIN_MEM_OUT)
    {
        /* Output goes to main memory: mask all KPU interrupts and let the
         * DMA-completion callback drive the model forward (ai_step for
         * intermediate layers, kpu_kmodel_done for the last one). */
        dmac_channel_number_t dma_ch = ctx->dma_ch;
        uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.dma_parameter.data.send_data_out = 1;
        select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
        if(ctx->current_layer < ctx->layers_length)
            dmac_set_irq(dma_ch, ai_step, ctx, 1);
        else
            dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
        /* 64-bit FIFO reads; transfer length is expressed in 8-byte
         * beats, rounded up from dma_total_byte. */
        dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                             DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
    } else
    {
        /* Output stays in KPU RAM: unmask calc_done so the layer-complete
         * interrupt advances the state machine instead of DMA. */
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};

        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 0,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        /* "interrupt_enabe" is the SDK's own (misspelled) field name. */
        layer.interrupt_enabe.data.int_en = 1;
    }

    kpu_send_layer((const kpu_layer_argument_t *)&layer);
}
|
---|
| 1486 |
|
---|
| 1487 | static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1488 | {
|
---|
| 1489 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1490 | #if USE_CACHED_AI_RAM
|
---|
| 1491 | uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
|
---|
| 1492 | #else
|
---|
| 1493 | uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
|
---|
| 1494 | #endif
|
---|
| 1495 |
|
---|
| 1496 | uint32_t row_padding = 16;
|
---|
| 1497 | uint32_t row_group = 4;
|
---|
| 1498 | uint32_t row_length = 1;
|
---|
| 1499 | uint32_t height = 4;
|
---|
| 1500 | uint32_t oc, x, y, channels = arg->channels;
|
---|
| 1501 |
|
---|
| 1502 | for(oc = 0; oc < channels; oc++)
|
---|
| 1503 | {
|
---|
| 1504 | uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
|
---|
| 1505 | for(y = 0; y < 1; y++)
|
---|
| 1506 | {
|
---|
| 1507 | uint8_t *y_origin = channel_origin + y * row_length * 64;
|
---|
| 1508 | for(x = 0; x < 1; x++)
|
---|
| 1509 | y_origin[x] = *src++;
|
---|
| 1510 | }
|
---|
| 1511 | }
|
---|
| 1512 |
|
---|
| 1513 | #if USE_CACHED_AI_RAM
|
---|
| 1514 | uint32_t lines = row_length * height * channels / row_group;
|
---|
| 1515 | kpu_flush_cache(arg->kpu_mem_out_address, lines);
|
---|
| 1516 | #endif
|
---|
| 1517 | }
|
---|
| 1518 |
|
---|
| 1519 | static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1520 | {
|
---|
| 1521 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1522 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1523 | uint32_t oc, channels = arg->channels;
|
---|
| 1524 |
|
---|
| 1525 | for(oc = 0; oc < channels; oc++)
|
---|
| 1526 | *dest++ = src[oc * 16];
|
---|
| 1527 | }
|
---|
| 1528 |
|
---|
| 1529 | static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1530 | {
|
---|
| 1531 | size_t width = arg->width;
|
---|
| 1532 | size_t height = arg->height;
|
---|
| 1533 | size_t channels = arg->channels;
|
---|
| 1534 |
|
---|
| 1535 | kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
|
---|
| 1536 | }
|
---|
| 1537 |
|
---|
| 1538 | int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
---|
| 1539 | {
|
---|
| 1540 | #if FIX_CACHE
|
---|
| 1541 | configASSERT(is_memory_cache((uintptr_t)buffer));
|
---|
| 1542 | #endif
|
---|
| 1543 | uintptr_t base_addr = (uintptr_t)buffer;
|
---|
| 1544 | const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
|
---|
| 1545 |
|
---|
| 1546 | if (header->version == 3 && header->arch == 0)
|
---|
| 1547 | {
|
---|
| 1548 | ctx->is_nncase = 0;
|
---|
| 1549 | ctx->model_buffer = buffer;
|
---|
| 1550 | ctx->output_count = header->output_count;
|
---|
| 1551 | ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
|
---|
| 1552 | ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
|
---|
| 1553 | ctx->layers_length = header->layers_length;
|
---|
| 1554 | ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
|
---|
| 1555 | ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
|
---|
| 1556 | if (!ctx->main_buffer)
|
---|
| 1557 | return -1;
|
---|
| 1558 | uint32_t body_size = 0;
|
---|
| 1559 | for (int i=0; i<ctx->layers_length; i++)
|
---|
| 1560 | {
|
---|
| 1561 | const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
|
---|
| 1562 | body_size += cnt_layer_header->body_size;
|
---|
| 1563 | }
|
---|
| 1564 | uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
|
---|
| 1565 | const uint8_t *body_start_cache = ctx->body_start;
|
---|
| 1566 | memcpy(body_start_iomem, body_start_cache, body_size);
|
---|
| 1567 | for (int i=0; i<body_size; i++)
|
---|
| 1568 | {
|
---|
| 1569 | configASSERT(body_start_iomem[i] == body_start_cache[i]);
|
---|
| 1570 | }
|
---|
| 1571 |
|
---|
| 1572 | } else
|
---|
| 1573 | {
|
---|
| 1574 | return -1;
|
---|
| 1575 | }
|
---|
| 1576 |
|
---|
| 1577 | return 0;
|
---|
| 1578 | }
|
---|
| 1579 |
|
---|
| 1580 | int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
|
---|
| 1581 | {
|
---|
| 1582 | if(ctx->is_nncase)
|
---|
| 1583 | return -1;
|
---|
| 1584 |
|
---|
| 1585 | if(index >= ctx->output_count)
|
---|
| 1586 | return -1;
|
---|
| 1587 |
|
---|
| 1588 | const kpu_model_output_t *output = ctx->outputs + index;
|
---|
| 1589 | *data = ctx->main_buffer + output->address;
|
---|
| 1590 | *size = output->size;
|
---|
| 1591 | return 0;
|
---|
| 1592 | }
|
---|
| 1593 |
|
---|
| 1594 | void kpu_model_free(kpu_model_context_t *ctx)
|
---|
| 1595 | {
|
---|
| 1596 | if(ctx->is_nncase)
|
---|
| 1597 | return;
|
---|
| 1598 |
|
---|
| 1599 | free(ctx->main_buffer);
|
---|
| 1600 | ctx->main_buffer = NULL;
|
---|
| 1601 | }
|
---|
| 1602 |
|
---|
#if KPU_DEBUG
/* Per-run profiling state, reset in kpu_run_kmodel. */
static uint64_t last_time;       /* timestamp (us) at the previous layer boundary; 0 = not started */
static uint64_t total_time;      /* accumulated time across all layers (us) */
static uint64_t kpu_time;        /* portion of total_time spent in K210 conv layers (us) */
static uint32_t last_layer_type; /* type of the layer whose duration is currently being measured */

/* Maps a layer-type enum value to a short human-readable name for syslog. */
static const char *str_layer_type(uint32_t type)
{
    switch(type)
    {
        case KL_ADD:
            return "Add";
        case KL_QUANTIZED_ADD:
            return "QuantAdd";
        case KL_GLOBAL_AVERAGE_POOL2D:
            return "GAP";
        case KL_QUANTIZED_MAX_POOL2D:
            return "QuantMaxPool2d";
        case KL_AVERAGE_POOL2D:
            return "AveragePool2d";
        case KL_QUANTIZE:
            return "Quantize";
        case KL_DEQUANTIZE:
            return "Dequantize";
        case KL_REQUANTIZE:
            return "Requantize";
        case KL_L2_NORMALIZATION:
            return "L2Norm";
        case KL_SOFTMAX:
            return "Softmax";
        case KL_CONCAT:
            return "Concat";
        case KL_QUANTIZED_CONCAT:
            return "QuantConcat";
        case KL_FULLY_CONNECTED:
            return "FullyConnected";
        case KL_TENSORFLOW_FLATTEN:
            return "TFFlatten";
        case KL_RESIZE_NEAREST_NEIGHBOR:
            return "ResizeNearestNeighbor";
        case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
            return "QuantResizeNearestNeighbor";
        case KL_CHANNELWISE_DEQUANTIZE:
            return "ChannelwiseDequantize";
        case KL_LOGISTIC:
            return "Logistic";
        case KL_K210_CONV:
            return "K210Conv";
        case KL_K210_ADD_PADDING:
            return "K210AddPad";
        case KL_K210_REMOVE_PADDING:
            return "K210RemovePad";
        case KL_K210_UPLOAD:
            return "K210Upload";
        default:
            return "Unknown";
    }
}
#endif
|
---|
| 1662 |
|
---|
/* Completion handler for a finished model run: quiesces KPU interrupts,
 * emits per-layer timing statistics when KPU_DEBUG is enabled, and invokes
 * the user's done callback. Returns 0 (plic_irq_callback_t convention). */
static int kpu_kmodel_done(kpu_model_context_t *ctx)
{
    /* Acknowledge and mask every KPU interrupt source — the run is over. */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
#if KPU_DEBUG
    uint32_t cnt_layer_id = ctx->current_layer - 1;
    uint64_t time = sysctl_get_time_us();
    if(last_time != 0)
    {
        /* Close out the timing of the final layer. */
        uint64_t layer_time = time - last_time;
        syslog(LOG_NOTICE, "layer %d [%s]: %f ms", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);
        total_time += layer_time;
        if(last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    /* Summary: "KPU" is conv hardware time, "CPU" is everything else. */
    syslog(LOG_NOTICE, "KPU: %f ms", kpu_time / 1000.0);
    syslog(LOG_NOTICE, "CPU: %f ms", (total_time - kpu_time) / 1000.0);
    syslog(LOG_NOTICE, "Model: %f ms", total_time / 1000.0);
#endif
    ctx->done_callback(ctx->userdata);
    return 0;
}
|
---|
| 1692 |
|
---|
/* Executes the next layer of the model. Invoked both as an IRQ callback
 * (DMA completion / KPU calc-done) and directly via ai_step_not_isr.
 * CPU-implemented layers run inline and chain into the next step; a
 * K210Conv layer is handed to the hardware and this function returns,
 * letting the conv's interrupt resume the chain. Returns 0, or -1 on a
 * layer-table overrun or an unknown layer type. */
static int ai_step(void *userdata)
{
    kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;

    uint32_t cnt_layer_id = ctx->current_layer;
    const uint8_t *layer_body = ctx->current_body;
    const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
    /* Guard against a stray interrupt arriving after the last layer. */
    if (cnt_layer_id >= ctx->layers_length) {
        //syslog(LOG_NOTICE, "overrun");
        kpu_kmodel_done(ctx);
        return -1;
    }

    /* Advance the cursor before dispatch so IRQ-driven layers (conv) see
     * the position of the next layer when their completion handler runs. */
    ctx->current_layer++;
    ctx->current_body += cnt_layer_header->body_size;

#if KPU_DEBUG
    uint64_t time = sysctl_get_time_us();
    if(last_time != 0)
    {
        /* Duration of the layer that just finished (type in last_layer_type). */
        uint64_t layer_time = time - last_time;
        syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
        total_time += layer_time;
        if(last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    last_layer_type = cnt_layer_header->type;
    last_time = sysctl_get_time_us();
#endif

    /* Dispatch on layer type; each case reinterprets layer_body as that
     * layer's argument struct. */
    switch(cnt_layer_header->type)
    {
        case KL_ADD:
            kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZED_ADD:
            kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
            break;
        case KL_GLOBAL_AVERAGE_POOL2D:
            kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZED_MAX_POOL2D:
            kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
            break;
        case KL_AVERAGE_POOL2D:
            kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZE:
            kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
            break;
        case KL_DEQUANTIZE:
            kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
            break;
        case KL_REQUANTIZE:
            kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
            break;
        case KL_L2_NORMALIZATION:
            kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
            break;
        case KL_SOFTMAX:
            kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
            break;
        case KL_CONCAT:
        case KL_QUANTIZED_CONCAT:
            kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
            break;
        case KL_FULLY_CONNECTED:
            kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
            break;
        case KL_TENSORFLOW_FLATTEN:
            kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
            break;
        case KL_RESIZE_NEAREST_NEIGHBOR:
            kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
            kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
            break;
        case KL_CHANNELWISE_DEQUANTIZE:
            kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
            break;
        case KL_LOGISTIC:
            kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
            break;
        case KL_K210_CONV:
            kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
            /* Hardware layer: its completion interrupt continues the chain. */
            return 0;
        case KL_K210_ADD_PADDING:
            kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
            break;
        case KL_K210_REMOVE_PADDING:
            kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
            break;
        case KL_K210_UPLOAD:
            kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
            break;
        default:
            assert(!"Layer is not supported.");
            kpu_kmodel_done(ctx);
            return -1;
    }

    /* CPU layer finished synchronously: recurse into the next layer, or
     * finish the model when this was the last one. */
    if (ctx->current_layer < ctx->layers_length)
        ai_step(userdata);
    else
        kpu_kmodel_done(ctx);
    return 0;
}
|
---|
| 1802 |
|
---|
/* Runs one scheduler step outside of interrupt context, with external
 * interrupts masked so it cannot race the KPU/DMA ISRs. */
static void ai_step_not_isr(void *ctx)
{
    sysctl_disable_irq();
    ai_step(ctx);
    /* Re-enable interrupts so subsequent completion IRQs can fire. */
    sysctl_enable_irq();
}
|
---|
| 1809 |
|
---|
/**
 * Starts asynchronous execution of a loaded kmodel.
 *
 * Resets the layer cursor, programs the KPU interrupt/FIFO registers,
 * registers the AI interrupt handler, feeds the input through the path
 * required by the first layer (DMA stream or CPU copy with padding), and
 * returns immediately; @p done_callback fires from interrupt context when
 * inference completes.
 *
 * @param ctx            Context previously initialized by kpu_load_kmodel.
 * @param src            Input tensor data.
 * @param dma_ch         DMA channel used for KPU input/output transfers.
 * @param done_callback  Invoked with @p userdata when inference finishes.
 * @param userdata       Opaque pointer passed to @p done_callback.
 * @return 0 on success, -1 for nncase contexts or an unsupported first layer.
 */
int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
    if(ctx->is_nncase)
        return -1;

    ctx->dma_ch = dma_ch;
    ctx->done_callback = done_callback;
    ctx->userdata = userdata;
    ctx->current_layer = 0;
    ctx->current_body = ctx->body_start;
#if KPU_DEBUG
    /* Reset the profiling accumulators for this run. */
    last_time = 0;
    total_time = 0;
    kpu_time = 0;
#endif

    kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
    /* Acknowledge any stale interrupts (bits 2:0). */
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    /* Bit 0 of the model flags selects 8-bit weight mode. */
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = header->flags & 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};

    plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, ai_step, ctx);
    plic_irq_enable(INTNO_AI);

    const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;

    /* The input path depends on the first layer's type. */
    switch(first_layer_header->type)
    {
        case KL_K210_CONV:
        {
            const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
            kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);

            /* Rows that are not a multiple of 64 bytes need CPU-side padding;
             * otherwise the input can be streamed in directly by DMA. */
            if((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
            {
                kpu_kmodel_input_with_padding(&layer_arg, src);
                ai_step_not_isr(ctx);
            } else
            {
                kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
            }
        }
        break;
        case KL_FULLY_CONNECTED:
        {
            /* Float input is copied straight into the working buffer. */
            const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
            kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
            ai_step_not_isr(ctx);
        }
        break;
        default:
            return -1;
    }

    return 0;
}
|
---|