source: azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.c@ 458

Last change on this file since 458 was 458, checked in by coas-nagasima, 4 years ago

Improved the behavior of SPI, Serial, and the KPU

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 42.6 KB
RevLine 
[453]1#include <assert.h>
2#include <float.h>
3#include <math.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <string.h>
7#include <stdint.h>
8#include <kernel.h>
9#include <t_syslog.h>
10#include <t_stdlib.h>
11#include <kernel_impl.h>
12#include <target_syssvc.h>
13#include "kendryte-k210.h"
14#include "device.h"
15#include "atomic.h"
16#include "kpu.h"
17#include "utils.h"
18#include "kpu_main.h"
[458]19#include "kernel_cfg.h"
[453]20
21#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
22
23uint64_t sysctl_get_time_us(void)
24{
[458]25 uint64_t v_cycle = read_cycle();
26 return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
[453]27}
28
29static int is_memory(uintptr_t address)
30{
[458]31 enum
32 {
33 mem_len = 6 * 1024 * 1024,
34 mem_no_cache_len = 8 * 1024 * 1024,
35 };
36 return ((address >= 0x80000000) && (address < 0x80000000 + mem_len)) || ((address >= 0x40000000) && (address < 0x40000000 + mem_no_cache_len)) || (address == 0x50450040);
[453]37}
38
39uint32_t is_memory_cache(uintptr_t address)
40{
[458]41#define MEM_CACHE_LEN (6 * 1024 * 1024)
[453]42
[458]43 return ((address >= 0x80000000) && (address < 0x80000000 + MEM_CACHE_LEN));
[453]44}
45
46int plic_irq_enable(INTNO irq_number)
47{
[458]48 if (irq_number != INTNO_AI)
49 return -1;
50 ena_int(irq_number);
51 return 0;
[453]52}
53
54int plic_set_priority(INTNO irq_number, uint32_t priority)
55{
[458]56 if (irq_number != INTNO_AI)
57 return -1;
58 set_ipriority(irq_number, priority);
59 return 0;
[453]60}
61
62plic_irq_callback_t ai_done_callback;
63void *ai_done_ctx;
64
65void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
66{
[458]67 if (irq != INTNO_AI)
68 return;
[453]69
[458]70 dis_int(INTNO_AI);
[453]71
[458]72 ai_done_callback = callback;
73 ai_done_ctx = ctx;
[453]74
[458]75 ena_int(INTNO_AI);
[453]76}
77
78void ai_done_isr(intptr_t exinf)
79{
[458]80 dis_int(INTNO_AI);
81 if (ai_done_callback != NULL)
82 {
83 ai_done_callback(ai_done_ctx);
84 }
85 ena_int(INTNO_AI);
[453]86}
87
88plic_irq_callback_t ai_dma_done_callback;
89void *ai_dma_done_ctx;
90
91void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
[458]92 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
[453]93{
[458]94 if (channel_num != AI_DMA_CH)
95 return;
[453]96
[458]97 //set_ipriority(INTNO_DMAAI, priority);
[453]98
[458]99 dis_int(INTNO_DMAAI);
[453]100
[458]101 ai_dma_done_callback = dmac_callback;
102 ai_dma_done_ctx = ctx;
[453]103
[458]104 ena_int(INTNO_DMAAI);
[453]105}
106
107void ai_dma_done_isr(DMA_Handle_t *dma)
108{
[458]109 dis_int(INTNO_DMAAI);
110
111 if (ai_dma_done_callback != NULL)
112 {
113 ai_dma_done_callback(ai_dma_done_ctx);
114 }
115
116 ena_int(INTNO_DMAAI);
[453]117}
118
119void dmac_set_irq(dmac_channel_number_t channel_num,
[458]120 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
[453]121{
[458]122 if (channel_num != AI_DMA_CH)
123 return;
[453]124
[458]125 //set_ipriority(INTNO_DMAAI, priority);
[453]126
[458]127 dis_int(INTNO_DMAAI);
[453]128
[458]129 ai_dma_done_callback = dmac_callback;
130 ai_dma_done_ctx = ctx;
[453]131
[458]132 ena_int(INTNO_DMAAI);
[453]133}
134
135DMA_Handle_t g_ai_hdma;
136
137void dmac_set_single_mode(dmac_channel_number_t channel_num,
[458]138 const void *src, void *dest, uint8_t src_inc,
139 uint8_t dest_inc,
140 uint8_t dmac_burst_size,
141 uint8_t dmac_trans_width,
142 size_t block_size)
[453]143{
[458]144 if (channel_num != AI_DMA_CH)
145 return;
[453]146
[458]147 DMA_Handle_t *hdma = &g_ai_hdma;
148 int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
149 uint8_t flow_control;
150 if (mem_type_src == 0 && mem_type_dest == 0)
151 flow_control = DMA_PERIPH_TO_PERIPH;
152 else if (mem_type_src == 1 && mem_type_dest == 0)
153 flow_control = DMA_MEMORY_TO_PERIPH;
154 else if (mem_type_src == 0 && mem_type_dest == 1)
155 flow_control = DMA_PERIPH_TO_MEMORY;
156 else
157 flow_control = DMA_MEMORY_TO_MEMORY;
[453]158
[458]159 hdma->Init.Direction = flow_control;                                        /* DMA transfer direction */
160 hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);  /* source handshake */
161 hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* destination handshake */
162 hdma->Init.SrcInc = src_inc;                                                     /* source increment setting */
163 hdma->Init.DstInc = dest_inc;                                                    /* destination increment setting */
164 hdma->Init.SrcTransWidth = dmac_trans_width;                                     /* source transfer width */
165 hdma->Init.DstTransWidth = dmac_trans_width;                                     /* destination transfer width */
166 hdma->Init.SrcBurstSize = dmac_burst_size;                                       /* source burst size */
167 hdma->Init.DstBurstSize = dmac_burst_size;                                       /* destination burst size */
168 dma_reset(hdma);
[453]169
[458]170 dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
[453]171}
172
173#define LAYER_BURST_SIZE 12
174
175#define KPU_DEBUG 0
176#define USE_CACHED_AI_RAM 0
177
178#define min(a, b) (((a) < (b)) ? (a) : (b))
179#define max(a, b) (((a) > (b)) ? (a) : (b))
180#define ALIGN_UP(x, align) (((x) + ((align)-1)) & (~((align)-1)))
181
182static int ai_step(void *userdata);
183static int kpu_kmodel_done(kpu_model_context_t *ctx);
184
185volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
186static volatile uint32_t kpu_status;
187
188static void kpu_send_layer(const kpu_layer_argument_t *layer)
189{
[458]190 kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
191 kpu->layer_argument_fifo = layer->image_addr.reg;
192 kpu->layer_argument_fifo = layer->image_channel_num.reg;
193 kpu->layer_argument_fifo = layer->image_size.reg;
194 kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
195 kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
196 kpu->layer_argument_fifo = layer->kernel_offset.reg;
197 kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
198 kpu->layer_argument_fifo = layer->write_back_cfg.reg;
199 kpu->layer_argument_fifo = layer->conv_value.reg;
200 kpu->layer_argument_fifo = layer->conv_value2.reg;
201 kpu->layer_argument_fifo = layer->dma_parameter.reg;
[453]202}
203
204void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
205{
[458]206 uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
207 dmac_set_irq(dma_ch, callback, userdata, 1);
208 dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
209 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
[453]210}
211
212static void kpu_conv2d_core(kpu_layer_argument_t *layer)
213{
[458]214 kpu_send_layer(layer);
[453]215}
216
217void kpu_conv2d(kpu_layer_argument_t *layer)
218{
[458]219 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
220 .calc_done_int = 1,
221 .layer_cfg_almost_empty_int = 1,
222 .layer_cfg_almost_full_int = 1};
223 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
224 .calc_done_int = 1,
225 .layer_cfg_almost_empty_int = 0,
226 .layer_cfg_almost_full_int = 1};
227 kpu_conv2d_core(layer);
[453]228}
229
230void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
231{
[458]232 quantize_param_t q1 = *src_param, q2 = *dest_param;
233 size_t oc, y, x;
[453]234
[458]235 if (((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
236 {
237 uint32_t row_padding = 16;
238 uint32_t row_group = 4;
239 uint32_t row_length = 1;
240 uint32_t height = 4;
[453]241
[458]242 for (oc = 0; oc < channels; oc++)
243 {
244 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
245 for (y = 0; y < 1; y++)
246 {
247 uint8_t *y_origin = channel_origin + y * row_length * 64;
248 for (x = 0; x < 1; x++)
249 {
250 int64_t sum = 0;
251 size_t i;
252 for (i = 0; i < kernel_size; i++)
253 sum += *src++;
[453]254
[458]255 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
256 if (value < 0)
257 value = 0;
258 if (value > 0xFF)
259 value = 0xFF;
260 y_origin[x] = value;
261 }
262 }
263 }
264 }
265 else
266 {
267 for (oc = 0; oc < channels; oc++)
268 {
269 int64_t sum = 0;
270 size_t i;
271 for (i = 0; i < kernel_size; i++)
272 sum += *src++;
[453]273
[458]274 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
275 if (value < 0)
276 value = 0;
277 if (value > 0xFF)
278 value = 0xFF;
279 dest[oc] = value;
280 }
281 }
[453]282}
283
284void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
285{
[458]286 quantize_param_t q = *src_param;
287 size_t oc;
[453]288
[458]289 for (oc = 0; oc < channels; oc++)
290 {
291 int64_t sum = 0;
292 size_t i;
293 for (i = 0; i < kernel_size; i++)
294 sum += *src++;
[453]295
[458]296 float value = (sum * q.scale + q.bias) / kernel_size;
297 dest[oc] = value;
298 }
[453]299}
300
301#if USE_CACHED_AI_RAM
302static void kpu_flush_cache(uint32_t addr, size_t lines)
303{
[458]304 size_t line;
305 for (line = 0; line < lines; line++)
306 {
307 const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
308 uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
309 size_t i;
310 for (i = 0; i < 8; i++)
311 dest[i] = src[i];
312 }
[453]313}
314#endif
315static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
316{
[458]317 if (shift > 0)
318 {
319 value >>= shift - 1;
320 if (value & 0x1)
321 {
322 if (value < 0)
323 value = (value >> 1) - 1;
324 else
325 value = (value >> 1) + 1;
326 }
327 else
328 {
329 value >>= 1;
330 }
331 }
[453]332
[458]333 return value;
[453]334}
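/*
 * Worked example (illustrative, derived from the function above): for positive
 * inputs kpu_carry_shift() is a right shift by `shift` bits that rounds
 * half away from zero:
 *
 *   kpu_carry_shift(5, 1) -> 3   (5 >> 0 = 5, LSB set, positive: (5 >> 1) + 1)
 *   kpu_carry_shift(4, 1) -> 2   (4 >> 0 = 4, LSB clear: plain 4 >> 1)
 *   kpu_carry_shift(x, 0) -> x   (a shift of 0 returns the value unchanged)
 */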
335static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
336{
[458]337 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
338 size_t oc, y, x;
339 uint32_t row_padding;
340 uint32_t row_group;
341 uint32_t row_length;
342 if (width <= 16)
343 {
344 row_padding = 16;
345 row_group = 4;
346 row_length = 1;
347 }
348 else if (width <= 32)
349 {
350 row_padding = 32;
351 row_group = 2;
352 row_length = 1;
353 }
354 else
355 {
356 row_padding = 64;
357 row_group = 1;
358 row_length = (width + 63) / 64;
359 }
[453]360
[458]361 if ((uintptr_t)src % 8 == 0 && width % 8 == 0)
362 {
[453]363#define UPLOAD_BEGIN() \
[458]364 for (oc = 0; oc < channels; oc++) \
365 { \
366 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
367 for (y = 0; y < height; y++) \
368 { \
369 uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);
[453]370
371#define UPLOAD_END() \
[458]372 } \
373 }
[453]374
[458]375 width /= 8;
376 const uint64_t *u64_src = (const uint64_t *)src;
377 if (width == 1)
378 {
379 UPLOAD_BEGIN()
380 y_origin[0] = *u64_src++;
381 UPLOAD_END()
382 }
383 else if (width == 2)
384 {
385 UPLOAD_BEGIN()
386 {
387 y_origin[0] = *u64_src++;
388 y_origin[1] = *u64_src++;
389 }
390 UPLOAD_END()
391 }
392 else if (width == 4)
393 {
394 UPLOAD_BEGIN()
395 {
396 y_origin[0] = *u64_src++;
397 y_origin[1] = *u64_src++;
398 y_origin[2] = *u64_src++;
399 y_origin[3] = *u64_src++;
400 }
401 UPLOAD_END()
402 }
403 else
404 {
405 UPLOAD_BEGIN()
406 for (x = 0; x < width; x++)
407 y_origin[x] = *u64_src++;
408 UPLOAD_END()
409 }
410 }
411 else
412 {
413 for (oc = 0; oc < channels; oc++)
414 {
415 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
416 for (y = 0; y < height; y++)
417 {
418 uint8_t *y_origin = channel_origin + y * row_length * 64;
419 for (x = 0; x < width; x++)
420 y_origin[x] = *src++;
421 }
422 }
423 }
[453]424}
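/*
 * Illustrative note on the layout written by kpu_upload_core() (derived from
 * the code above, not from separate documentation): for an image with
 * width <= 16, four channels share each 64-byte KPU RAM row (row_group = 4,
 * row_padding = 16), so channel oc, row y starts at byte offset
 *
 *   (oc / 4) * height * 64 + y * 64 + (oc % 4) * 16
 *
 * e.g. with height = 16, channel 5 / row 2 begins at 1 * 1024 + 128 + 16 = 1168.
 */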
425static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
426{
[458]427 size_t width = layer->image_size.data.i_row_wid + 1;
428 size_t height = layer->image_size.data.i_col_high + 1;
429 size_t channels = layer->image_channel_num.data.i_ch_num + 1;
[453]430
[458]431 kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
[453]432}
433
434static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
435{
[458]436 memcpy(dest, src, count * sizeof(float));
[453]437}
438
439static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
440{
[458]441 size_t i;
[453]442
[458]443 if (act == KLA_RELU)
444 {
445 for (i = 0; i < count; i++)
446 data[i] = max(data[i], 0);
447 }
448 else if (act == KLA_RELU6)
449 {
450 for (i = 0; i < count; i++)
451 data[i] = min(max(data[i], 0), 6);
452 }
[453]453}
454
455static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
456{
[458]457 const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
458 const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
459 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
460 size_t i, count = arg->count;
[453]461
[458]462 for (i = 0; i < count; i++)
463 dest[i] = src_a[i] + src_b[i];
[453]464}
465
466static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
467{
[458]468 const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
469 const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
470 size_t count = ALIGN_UP(arg->count, 8) / 8;
471 int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
472 int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
473 int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;
[453]474
[458]475 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
476 size_t i;
[453]477
[458]478 if (sh_a == sh_b)
479 {
[453]480#define QADD_UNROLL_1(x) \
[458]481 int64_t a##x = *src_a++; \
482 int64_t b##x = *src_b++;
[453]483
484#define QADD_UNROLL_2(x) \
[458]485 a##x += off_a; \
486 b##x += off_b;
[453]487
488#define QADD_UNROLL_3(x) \
[458]489 a##x *= mul_a; \
490 b##x *= mul_b;
[453]491
492#define QADD_UNROLL_4(x) \
[458]493 int64_t v##x = a##x + b##x;
[453]494
495#define QADD_UNROLL_5(x) \
[458]496 v##x >>= sh_a;
[453]497
498#define QADD_UNROLL_6(x) \
[458]499 v##x *= mul_o;
[453]500
501#define QADD_UNROLL_7(x) \
[458]502 v##x = kpu_carry_shift(v##x, sh_o);
[453]503
504#define QADD_UNROLL_8(x) \
[458]505 v##x += off_o;
[453]506
507#define QADD_UNROLL_9(x) \
[458]508 v##x = min(0xFF, max(0, v##x));
[453]509
510#define QADD_UNROLL_10(x) \
[458]511 *dest++ = v##x;
[453]512
513#define QADD_UNROLL_S(x) \
[458]514 QADD_UNROLL_##x(0) \
515 QADD_UNROLL_##x(1) \
516 QADD_UNROLL_##x(2) \
517 QADD_UNROLL_##x(3) \
518 QADD_UNROLL_##x(4) \
519 QADD_UNROLL_##x(5) \
520 QADD_UNROLL_##x(6) \
521 QADD_UNROLL_##x(7)
[453]522
[458]523 for (i = 0; i < count; i++)
524 {
525 QADD_UNROLL_S(1);
526 QADD_UNROLL_S(2);
527 QADD_UNROLL_S(3);
528 QADD_UNROLL_S(4);
529 QADD_UNROLL_S(5);
530 QADD_UNROLL_S(6);
531 QADD_UNROLL_S(7);
532 QADD_UNROLL_S(8);
533 QADD_UNROLL_S(9);
534 QADD_UNROLL_S(10);
535 }
536 }
537 else
538 {
[453]539#undef QADD_UNROLL_1
540#define QADD_UNROLL_1(x) \
[458]541 int64_t a##x = *src_a++; \
542 int64_t b##x = *src_b++;
[453]543
544#undef QADD_UNROLL_2
545#define QADD_UNROLL_2(x) \
[458]546 a##x += off_a; \
547 b##x += off_b;
[453]548
549#undef QADD_UNROLL_3
550#define QADD_UNROLL_3(x) \
[458]551 a##x *= mul_a; \
552 b##x *= mul_b;
[453]553
554#undef QADD_UNROLL_4
555#define QADD_UNROLL_4(x) \
[458]556 a##x >>= sh_a; \
557 b##x >>= sh_b;
[453]558
559#undef QADD_UNROLL_5
560#define QADD_UNROLL_5(x) \
[458]561 int64_t v##x = a##x + b##x;
[453]562
563#undef QADD_UNROLL_6
564#define QADD_UNROLL_6(x) \
[458]565 v##x *= mul_o;
[453]566
567#undef QADD_UNROLL_7
568#define QADD_UNROLL_7(x) \
[458]569 v##x = kpu_carry_shift(v##x, sh_o);
[453]570
571#undef QADD_UNROLL_8
572#define QADD_UNROLL_8(x) \
[458]573 v##x += off_o;
[453]574
575#undef QADD_UNROLL_9
576#define QADD_UNROLL_9(x) \
[458]577 v##x = min(0xFF, max(0, v##x));
[453]578
579#undef QADD_UNROLL_10
580#define QADD_UNROLL_10(x) \
[458]581 *dest++ = v##x;
[453]582
583#undef QADD_UNROLL_S
584#define QADD_UNROLL_S(x) \
[458]585 QADD_UNROLL_##x(0) \
586 QADD_UNROLL_##x(1) \
587 QADD_UNROLL_##x(2) \
588 QADD_UNROLL_##x(3) \
589 QADD_UNROLL_##x(4) \
590 QADD_UNROLL_##x(5) \
591 QADD_UNROLL_##x(6) \
592 QADD_UNROLL_##x(7)
[453]593
[458]594 for (i = 0; i < count; i++)
595 {
596 QADD_UNROLL_S(1);
597 QADD_UNROLL_S(2);
598 QADD_UNROLL_S(3);
599 QADD_UNROLL_S(4);
600 QADD_UNROLL_S(5);
601 QADD_UNROLL_S(6);
602 QADD_UNROLL_S(7);
603 QADD_UNROLL_S(8);
604 QADD_UNROLL_S(9);
605 QADD_UNROLL_S(10);
606 }
607 }
[453]608}
609
610static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
611{
[458]612 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
613 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
614 size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
[453]615
[458]616 for (oc = 0; oc < channels; oc++)
617 {
618 float sum = 0.f;
619 size_t i;
620 for (i = 0; i < kernel_size; i++)
621 sum += *src++;
[453]622
[458]623 dest[oc] = sum / kernel_size;
624 }
[453]625}
626
627static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
628{
[458]629 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
630 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
631 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
632 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
633 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
634 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
[453]635
[458]636 uint32_t out_y, out_x, oc;
[453]637
[458]638 for (oc = 0; oc < out_shape.channels; oc++)
639 {
640 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
641 for (out_y = 0; out_y < out_shape.height; out_y++)
642 {
643 for (out_x = 0; out_x < out_shape.width; out_x++)
644 {
645 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
646 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
647 int32_t kernel_x_start = max(0, -in_x_origin);
648 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
649 int32_t kernel_y_start = max(0, -in_y_origin);
650 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
651 uint8_t value = 0;
[453]652
[458]653 int32_t kernel_y, kernel_x;
654 for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
655 {
656 for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
657 {
658 int32_t in_x = in_x_origin + kernel_x;
659 int32_t in_y = in_y_origin + kernel_y;
660 value = max(value, channel_src[in_y * in_shape.width + in_x]);
661 }
662 }
[453]663
[458]664 *dest++ = value;
665 }
666 }
667 }
[453]668}
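/*
 * Illustrative note (derived from the loop above; kpu_average_pool2d() below
 * uses the same indexing): the kernel_x/y_start and _end bounds clip the
 * pooling window to the input image. With padding_width = 1 and out_x = 0,
 * in_x_origin = -1, so kernel_x_start = max(0, 1) = 1 and the out-of-image
 * column at in_x = -1 is simply skipped.
 */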
669
670static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
671{
[458]672 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
673 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
674 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
675 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
676 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
677 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
[453]678
[458]679 uint32_t out_y, out_x, oc;
[453]680
[458]681 for (oc = 0; oc < out_shape.channels; oc++)
682 {
683 const float *channel_src = src + in_shape.width * in_shape.height * oc;
684 for (out_y = 0; out_y < out_shape.height; out_y++)
685 {
686 for (out_x = 0; out_x < out_shape.width; out_x++)
687 {
688 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
689 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
690 int32_t kernel_x_start = max(0, -in_x_origin);
691 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
692 int32_t kernel_y_start = max(0, -in_y_origin);
693 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
694 float value = 0;
695 float kernel_count = 0;
[453]696
[458]697 int32_t kernel_y, kernel_x;
698 for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
699 {
700 for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
701 {
702 int32_t in_x = in_x_origin + kernel_x;
703 int32_t in_y = in_y_origin + kernel_y;
704 value += channel_src[in_y * in_shape.width + in_x];
705 kernel_count++;
706 }
707 }
[453]708
[458]709 *dest++ = value / kernel_count;
710 }
711 }
712 }
[453]713}
714
715static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
716{
[458]717 size_t count = arg->count;
718 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
[453]719
[458]720 kpu_model_quant_param_t q = arg->quant_param;
[453]721
[458]722 float scale = 1.f / q.scale;
[453]723
[458]724 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
725 size_t i;
726 for (i = 0; i < count; i++)
727 {
728 int value = roundf((*src++ - q.bias) * scale);
729 if (value < 0)
730 value = 0;
731 if (value > 0xFF)
732 value = 0xFF;
733 *dest++ = (uint8_t)value;
734 }
[453]735}
736
737static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
738{
[458]739 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
740 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
741 size_t oc, count = arg->count;
742 kpu_model_quant_param_t q = arg->quant_param;
[453]743
[458]744 for (oc = 0; oc < count; oc++)
745 dest[oc] = *src++ * q.scale + q.bias;
[453]746}
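/*
 * Illustrative note (example values are assumed): kpu_quantize() and
 * kpu_kmodel_dequantize() apply the two directions of the same affine map,
 *
 *   real ~= q * scale + bias
 *   q     = clamp(round((real - bias) / scale), 0, 255)
 *
 * e.g. with scale = 0.5 and bias = -64.0, a real value of 10.0 quantizes to
 * round((10.0 + 64.0) / 0.5) = 148, and 148 dequantizes back to
 * 148 * 0.5 - 64.0 = 10.0.
 */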
747
748static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
749{
[458]750 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
751 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
752 size_t oc, i, channels = arg->channels, count = arg->channel_size;
[453]753
[458]754 for (oc = 0; oc < channels; oc++)
755 {
756 const kpu_model_quant_param_t q = arg->quant_params[oc];
[453]757
[458]758 for (i = 0; i < count; i++)
759 *dest++ = *src++ * q.scale + q.bias;
760 }
[453]761}
762
763static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
764{
[458]765 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
766 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
767 size_t oc, count = arg->count;
768 const uint8_t *table = arg->table;
[453]769
[458]770 if (false && count % 8 == 0)
771 {
772 for (oc = 0; oc < count;)
773 {
774 dest[oc++] = table[*src++];
775 dest[oc++] = table[*src++];
776 dest[oc++] = table[*src++];
777 dest[oc++] = table[*src++];
778 dest[oc++] = table[*src++];
779 dest[oc++] = table[*src++];
780 dest[oc++] = table[*src++];
781 dest[oc++] = table[*src++];
782 }
783 }
784 else
785 {
786 for (oc = 0; oc < count; oc++)
787 dest[oc] = table[src[oc]];
788 }
[453]789}
790
791static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
792{
[458]793 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
794 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
795 size_t oc, channels = arg->channels;
[453]796
[458]797 float sum = 0.f;
798 const float epsilon = 1e-10f;
799 for (oc = 0; oc < channels; oc++)
800 sum += src[oc] * src[oc];
801 if (sum < epsilon)
802 sum = epsilon;
803 sum = 1.f / sqrtf(sum);
804 for (oc = 0; oc < channels; oc++)
805 dest[oc] = src[oc] * sum;
[453]806}
807
808static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
809{
[458]810 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
811 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
812 size_t oc, channels = arg->channels;
[453]813
[458]814 float max = -FLT_MAX; /* most negative float, so the first element always updates the maximum */
815 for (oc = 0; oc < channels; oc++)
816 max = fmaxf(max, src[oc]);
[453]817
[458]818 float sum = 0.f;
819 for (oc = 0; oc < channels; oc++)
820 {
821 float value = expf(src[oc] - max);
822 sum += value;
823 dest[oc] = value;
824 }
[453]825
[458]826 for (oc = 0; oc < channels; oc++)
827 dest[oc] /= sum;
[453]828}
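/*
 * Illustrative note: subtracting the running maximum above is the standard
 * numerically stable softmax,
 *
 *   dest[i] = expf(src[i] - max) / sum_j expf(src[j] - max)
 *
 * which is mathematically identical to the plain softmax but keeps expf()
 * from overflowing for large inputs.
 */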
829
830static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
831{
[458]832 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
833 uint32_t count = arg->input_count, i;
[453]834
[458]835 for (i = 0; i < count; i++)
836 {
837 kpu_model_memory_range_t input = arg->inputs_mem[i];
838 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
839 memcpy(dest, src, input.size);
840 dest += input.size;
841 }
[453]842}
843
844static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
845{
[458]846 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
847 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
848 uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
849 float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
850 float *bias = (float *)malloc(out_channels * sizeof(float));
851 memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
852 memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
[453]853
[458]854 if (in_channels % 8 == 0)
855 {
[453]856#define FC_UNROLL_1(x) \
[458]857 float i##x = *c_src++; \
858 float w##x = *c_weights++;
[453]859
860#define FC_UNROLL_2(x) \
[458]861 sum += i##x * w##x;
[453]862
863#define FC_UNROLL_S(x) \
[458]864 FC_UNROLL_##x(0) \
865 FC_UNROLL_##x(1) \
866 FC_UNROLL_##x(2) \
867 FC_UNROLL_##x(3) \
868 FC_UNROLL_##x(4) \
869 FC_UNROLL_##x(5) \
870 FC_UNROLL_##x(6) \
871 FC_UNROLL_##x(7)
[453]872
[458]873 for (oc = 0; oc < out_channels; oc++)
874 {
875 const float *c_src = src;
876 const float *c_weights = weights + oc * in_channels;
[453]877
[458]878 float sum = 0.0f;
879 for (ic = 0; ic < in_channels / 8; ic++)
880 {
881 FC_UNROLL_S(1);
882 FC_UNROLL_S(2);
883 }
[453]884
[458]885 dest[oc] = sum + bias[oc];
886 }
887 }
888 else
889 {
890 for (oc = 0; oc < out_channels; oc++)
891 {
892 const float *c_weights = weights + oc * in_channels;
[453]893
[458]894 float sum = 0.0f;
895 for (ic = 0; ic < in_channels; ic++)
896 sum += src[ic] * c_weights[ic];
897 dest[oc] = sum + bias[oc];
898 }
899 }
900 free(weights);
901 free(bias);
902 kpu_float_activation(dest, out_channels, arg->act);
[453]903}
904
905static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
906{
[458]907 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
908 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
909 kpu_model_shape_t in_shape = arg->shape;
910 uint32_t oc, oy, ox;
[453]911
[458]912 for (oy = 0; oy < in_shape.height; oy++)
913 for (ox = 0; ox < in_shape.width; ox++)
914 for (oc = 0; oc < in_shape.channels; oc++)
915 *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
[453]916}
917
918static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
919{
[458]920 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
921 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
922 kpu_model_shape_t in_shape = arg->in_shape;
923 uint32_t out_width = arg->out_width, out_height = arg->out_height;
924 uint32_t oc, oy, ox;
[453]925
[458]926 float height_scale = (float)in_shape.height / out_height;
927 float width_scale = (float)in_shape.width / out_width;
[453]928
[458]929 for (oc = 0; oc < in_shape.channels; oc++)
930 {
931 const float *channel_src = src + in_shape.width * in_shape.height * oc;
932 for (oy = 0; oy < out_height; oy++)
933 {
934 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
935 const float *y_origin = channel_src + in_y * in_shape.width;
936 for (ox = 0; ox < out_width; ox++)
937 {
938 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
939 *dest++ = y_origin[in_x];
940 }
941 }
942 }
[453]943}
944
945static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
946{
[458]947 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
948 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
949 kpu_model_shape_t in_shape = arg->in_shape;
950 uint32_t out_width = arg->out_width, out_height = arg->out_height;
951 uint32_t oc, oy, ox;
[453]952
[458]953 float height_scale = (float)in_shape.height / out_height;
954 float width_scale = (float)in_shape.width / out_width;
[453]955
[458]956 for (oc = 0; oc < in_shape.channels; oc++)
957 {
958 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
959 for (oy = 0; oy < out_height; oy++)
960 {
961 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
962 const uint8_t *y_origin = channel_src + in_y * in_shape.width;
963 for (ox = 0; ox < out_width; ox++)
964 {
965 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
966 *dest++ = y_origin[in_x];
967 }
968 }
969 }
[453]970}
971
972static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
973{
[458]974 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
975 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
976 size_t oc, channels = arg->channels;
[453]977
[458]978 for (oc = 0; oc < channels; oc++)
979 dest[oc] = 1.f / (1.f + expf(-src[oc]));
[453]980}
981
982static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
983{
[458]984 volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
985 layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
986 layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
987 layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;
[453]988
[458]989 if (arg->flags & KLF_MAIN_MEM_OUT)
990 {
991 dmac_channel_number_t dma_ch = ctx->dma_ch;
992 uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
993 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
994 .calc_done_int = 1,
995 .layer_cfg_almost_empty_int = 1,
996 .layer_cfg_almost_full_int = 1};
997 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
998 .calc_done_int = 1,
999 .layer_cfg_almost_empty_int = 1,
1000 .layer_cfg_almost_full_int = 1};
1001 layer.dma_parameter.data.send_data_out = 1;
1002 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
1003 if (ctx->current_layer < ctx->layers_length)
1004 dmac_set_irq(dma_ch, ai_step, ctx, 1);
1005 else
1006 dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
1007 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
1008 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
1009 }
1010 else
1011 {
1012 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1013 .calc_done_int = 1,
1014 .layer_cfg_almost_empty_int = 1,
1015 .layer_cfg_almost_full_int = 1};
[453]1016
[458]1017 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1018 .calc_done_int = 0,
1019 .layer_cfg_almost_empty_int = 1,
1020 .layer_cfg_almost_full_int = 1};
1021 layer.interrupt_enabe.data.int_en = 1;
1022 }
[453]1023
[458]1024 kpu_send_layer((const kpu_layer_argument_t *)&layer);
[453]1025}
1026
1027static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
1028{
[458]1029 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
[453]1030#if USE_CACHED_AI_RAM
[458]1031 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
[453]1032#else
[458]1033 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
[453]1034#endif
1035
[458]1036 uint32_t row_padding = 16;
1037 uint32_t row_group = 4;
1038 uint32_t row_length = 1;
1039 uint32_t height = 4;
1040 uint32_t oc, x, y, channels = arg->channels;
[453]1041
[458]1042 for (oc = 0; oc < channels; oc++)
1043 {
1044 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
1045 for (y = 0; y < 1; y++)
1046 {
1047 uint8_t *y_origin = channel_origin + y * row_length * 64;
1048 for (x = 0; x < 1; x++)
1049 y_origin[x] = *src++;
1050 }
1051 }
[453]1052
1053#if USE_CACHED_AI_RAM
[458]1054 uint32_t lines = row_length * height * channels / row_group;
1055 kpu_flush_cache(arg->kpu_mem_out_address, lines);
[453]1056#endif
1057}
1058
1059static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
1060{
[458]1061 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1062 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1063 uint32_t oc, channels = arg->channels;
[453]1064
[458]1065 for (oc = 0; oc < channels; oc++)
1066 *dest++ = src[oc * 16];
[453]1067}
1068
1069static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
1070{
[458]1071 size_t width = arg->width;
1072 size_t height = arg->height;
1073 size_t channels = arg->channels;
[453]1074
[458]1075 kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
[453]1076}
1077
1078int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
1079{
1080#if FIX_CACHE
[458]1081 configASSERT(is_memory_cache((uintptr_t)buffer));
[453]1082#endif
[458]1083 uintptr_t base_addr = (uintptr_t)buffer;
1084 const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
[453]1085
[458]1086 if (header->version == 3 && header->arch == 0)
1087 {
1088 ctx->model_buffer = buffer;
1089 ctx->output_count = header->output_count;
1090 ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
1091 ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
1092 ctx->layers_length = header->layers_length;
1093 ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
1094 ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
1095 if (!ctx->main_buffer)
1096 return -1;
1097 uint32_t body_size = 0;
1098 for (int i = 0; i < ctx->layers_length; i++)
1099 {
1100 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
1101 body_size += cnt_layer_header->body_size;
1102 }
1103 uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
1104 const uint8_t *body_start_cache = ctx->body_start;
1105 memcpy(body_start_iomem, body_start_cache, body_size);
1106 for (int i = 0; i < body_size; i++)
1107 {
1108 configASSERT(body_start_iomem[i] == body_start_cache[i]);
1109 }
1110 }
1111 else
1112 {
1113 return -1;
1114 }
[453]1115
[458]1116 return 0;
[453]1117}
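/*
 * Illustrative note (reconstructed from the parsing code above): a kmodel
 * version-3 blob is laid out as
 *
 *   kpu_kmodel_header_t
 *   kpu_model_output_t       x header->output_count
 *   kpu_model_layer_header_t x header->layers_length
 *   concatenated layer bodies (cnt_layer_header->body_size bytes each)
 *
 * ctx->main_buffer is a separate malloc()'d scratch area of
 * header->main_mem_usage bytes that holds intermediate tensors.
 */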
1118
1119int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
1120{
[458]1121 if (index >= ctx->output_count)
1122 return -1;
[453]1123
[458]1124 const kpu_model_output_t *output = ctx->outputs + index;
1125 *data = ctx->main_buffer + output->address;
1126 *size = output->size;
1127 return 0;
[453]1128}
1129
1130void kpu_model_free(kpu_model_context_t *ctx)
1131{
[458]1132 free(ctx->main_buffer);
1133 ctx->main_buffer = NULL;
[453]1134}
1135
1136#if KPU_DEBUG
1137static uint64_t last_time;
1138static uint64_t total_time;
1139static uint64_t kpu_time;
1140static uint32_t last_layer_type;
1141
1142static const char *str_layer_type(uint32_t type)
1143{
[458]1144 switch (type)
1145 {
1146 case KL_ADD:
1147 return "Add";
1148 case KL_QUANTIZED_ADD:
1149 return "QuantAdd";
1150 case KL_GLOBAL_AVERAGE_POOL2D:
1151 return "GAP";
1152 case KL_QUANTIZED_MAX_POOL2D:
1153 return "QuantMaxPool2d";
1154 case KL_AVERAGE_POOL2D:
1155 return "AveragePool2d";
1156 case KL_QUANTIZE:
1157 return "Quantize";
1158 case KL_DEQUANTIZE:
1159 return "Dequantize";
1160 case KL_REQUANTIZE:
1161 return "Requantize";
1162 case KL_L2_NORMALIZATION:
1163 return "L2Norm";
1164 case KL_SOFTMAX:
1165 return "Softmax";
1166 case KL_CONCAT:
1167 return "Concat";
1168 case KL_QUANTIZED_CONCAT:
1169 return "QuantConcat";
1170 case KL_FULLY_CONNECTED:
1171 return "FullyConnected";
1172 case KL_TENSORFLOW_FLATTEN:
1173 return "TFFlatten";
1174 case KL_RESIZE_NEAREST_NEIGHBOR:
1175 return "ResizeNearestNeighbor";
1176 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
1177 return "QuantResizeNearestNeighbor";
1178 case KL_CHANNELWISE_DEQUANTIZE:
1179 return "ChannelwiseDequantize";
1180 case KL_LOGISTIC:
1181 return "Logistic";
1182 case KL_K210_CONV:
1183 return "K210Conv";
1184 case KL_K210_ADD_PADDING:
1185 return "K210AddPad";
1186 case KL_K210_REMOVE_PADDING:
1187 return "K210RemovePad";
1188 case KL_K210_UPLOAD:
1189 return "K210Upload";
1190 default:
1191 return "Unknown";
1192 }
[453]1193}
1194#endif
1195
1196static int kpu_kmodel_done(kpu_model_context_t *ctx)
1197{
[458]1198 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1199 .calc_done_int = 1,
1200 .layer_cfg_almost_empty_int = 1,
1201 .layer_cfg_almost_full_int = 1};
1202 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1203 .calc_done_int = 1,
1204 .layer_cfg_almost_empty_int = 1,
1205 .layer_cfg_almost_full_int = 1};
[453]1206#if KPU_DEBUG
[458]1207 uint32_t cnt_layer_id = ctx->current_layer;
1208 uint64_t time = sysctl_get_time_us();
1209 if (last_time != 0)
1210 {
1211 uint64_t layer_time = time - last_time;
1212 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
1213 total_time += layer_time;
1214 if (last_layer_type == KL_K210_CONV)
1215 kpu_time += layer_time;
1216 }
[453]1217
[458]1218 syslog(LOG_NOTICE, "KPU: %d.%03d ms", kpu_time / 1000, kpu_time % 1000);
1219 syslog(LOG_NOTICE, "CPU: %d.%03d ms", (total_time - kpu_time) / 1000, (total_time - kpu_time) % 1000);
1220 syslog(LOG_NOTICE, "Model: %d.%03d ms", total_time / 1000, total_time % 1000);
[453]1221#endif
[458]1222 ctx->done_callback(ctx->userdata);
1223 return 0;
[453]1224}
1225
1226static int ai_step(void *userdata)
1227{
[458]1228 kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;
[453]1229
[458]1230 uint32_t cnt_layer_id = ctx->current_layer;
1231 const uint8_t *layer_body = ctx->current_body;
1232 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
1233 if (cnt_layer_id >= ctx->layers_length)
1234 {
1235 //syslog(LOG_NOTICE, "overrun");
1236 kpu_kmodel_done(ctx);
1237 return -1;
1238 }
[453]1239
[458]1240 ctx->current_layer++;
1241 ctx->current_body += cnt_layer_header->body_size;
[453]1242
1243#if KPU_DEBUG
[458]1244 uint64_t time = sysctl_get_time_us();
1245 if (last_time != 0)
1246 {
1247 uint64_t layer_time = time - last_time;
1248 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
1249 total_time += layer_time;
1250 if (last_layer_type == KL_K210_CONV)
1251 kpu_time += layer_time;
1252 }
[453]1253
[458]1254 last_layer_type = cnt_layer_header->type;
1255 last_time = sysctl_get_time_us();
[453]1256#endif
1257
[458]1258 switch (cnt_layer_header->type)
1259 {
1260 case KL_ADD:
1261 kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
1262 break;
1263 case KL_QUANTIZED_ADD:
1264 kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
1265 break;
1266 case KL_GLOBAL_AVERAGE_POOL2D:
1267 kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
1268 break;
1269 case KL_QUANTIZED_MAX_POOL2D:
1270 kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
1271 break;
1272 case KL_AVERAGE_POOL2D:
1273 kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
1274 break;
1275 case KL_QUANTIZE:
1276 kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
1277 break;
1278 case KL_DEQUANTIZE:
1279 kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
1280 break;
1281 case KL_REQUANTIZE:
1282 kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
1283 break;
1284 case KL_L2_NORMALIZATION:
1285 kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
1286 break;
1287 case KL_SOFTMAX:
1288 kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
1289 break;
1290 case KL_CONCAT:
1291 case KL_QUANTIZED_CONCAT:
1292 kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
1293 break;
1294 case KL_FULLY_CONNECTED:
1295 kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
1296 break;
1297 case KL_TENSORFLOW_FLATTEN:
1298 kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
1299 break;
1300 case KL_RESIZE_NEAREST_NEIGHBOR:
1301 kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
1302 break;
1303 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
1304 kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
1305 break;
1306 case KL_CHANNELWISE_DEQUANTIZE:
1307 kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
1308 break;
1309 case KL_LOGISTIC:
1310 kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
1311 break;
1312 case KL_K210_CONV:
1313 kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
1314 return 0;
1315 case KL_K210_ADD_PADDING:
1316 kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
1317 break;
1318 case KL_K210_REMOVE_PADDING:
1319 kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
1320 break;
1321 case KL_K210_UPLOAD:
1322 kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
1323 break;
1324 default:
1325 assert(!"Layer is not supported.");
1326 kpu_kmodel_done(ctx);
1327 return -1;
1328 }
[453]1329
[458]1330 if (ctx->current_layer < (ctx->layers_length - 1))
1331 ai_step(userdata);
1332 else
1333 kpu_kmodel_done(ctx);
1334 return 0;
[453]1335}
1336
1337static void ai_step_not_isr(void *userdata)
1338{
[458]1339 dis_int(INTNO_DMAAI);
1340 dis_int(INTNO_AI);
1341
1342 ai_step(userdata);
1343
1344 ena_int(INTNO_DMAAI);
1345 ena_int(INTNO_AI);
[453]1346}
1347
1348int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
1349{
[458]1350 ctx->dma_ch = dma_ch;
1351 ctx->done_callback = done_callback;
1352 ctx->userdata = userdata;
1353 ctx->current_layer = 0;
1354 ctx->current_body = ctx->body_start;
[453]1355#if KPU_DEBUG
[458]1356 last_time = 0;
1357 total_time = 0;
1358 kpu_time = 0;
[453]1359#endif
1360
[458]1361 kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
1362 kpu->interrupt_clear.reg = 7;
1363 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
1364 .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
1365 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
1366 .eight_bit_mode = header->flags & 1};
1367 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1368 .calc_done_int = 1,
1369 .layer_cfg_almost_empty_int = 0,
1370 .layer_cfg_almost_full_int = 1};
[453]1371
[458]1372 //plic_set_priority(INTNO_AI, 1);
1373 plic_irq_register(INTNO_AI, ai_step, ctx);
1374 plic_irq_enable(INTNO_AI);
[453]1375
[458]1376 const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;
[453]1377
[458]1378 switch (first_layer_header->type)
1379 {
1380 case KL_K210_CONV:
1381 {
1382 const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
1383 kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);
[453]1384
[458]1385 if ((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
1386 {
1387 kpu_kmodel_input_with_padding(&layer_arg, src);
1388 ai_step_not_isr(ctx);
1389 }
1390 else
1391 {
1392 kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
1393 }
1394 }
1395 break;
1396 case KL_FULLY_CONNECTED:
1397 {
1398 const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
1399 kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
1400 ai_step_not_isr(ctx);
1401 }
1402 break;
1403 default:
1404 return -1;
1405 }
[453]1406
[458]1407 return 0;
[453]1408}
[458]1409
1410ER kpu_init(kpu_model_context_t *ctx)
1411{
1412 g_ai_hdma.chnum = AI_DMA_CH;
1413 g_ai_hdma.xfercallback = ai_dma_done_isr;
1414 g_ai_hdma.errorcallback = NULL;
1415 g_ai_hdma.Init.Request = DMA_SELECT_AI_RX_REQ;      /* DMA request selection */
1416 g_ai_hdma.Init.Direction = DMA_PERIPH_TO_MEMORY;    /* DMA transfer direction */
1417 g_ai_hdma.Init.SrcMultBlock = DMAC_MULTBLOCK_CONT;  /* source multi-block type */
1418 g_ai_hdma.Init.DrcMultBlock = DMAC_MULTBLOCK_CONT;  /* destination multi-block type */
1419 g_ai_hdma.Init.SrcHandShake = DMAC_HS_HARDWARE;     /* source handshake */
1420 g_ai_hdma.Init.DrcHandShake = DMAC_HS_SOFTWARE;     /* destination handshake */
1421 g_ai_hdma.Init.SrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* source hardware handshake polarity */
1422 g_ai_hdma.Init.DrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* destination hardware handshake polarity */
1423 g_ai_hdma.Init.Priority = 4;                         /* priority */
1424 g_ai_hdma.Init.SrcMaster = DMAC_MASTER1;             /* source master setting */
1425 g_ai_hdma.Init.DstMaster = DMAC_MASTER2;             /* destination master setting */
1426 g_ai_hdma.Init.SrcInc = DMAC_ADDR_NOCHANGE;          /* source increment setting */
1427 g_ai_hdma.Init.DstInc = DMAC_ADDR_INCREMENT;         /* destination increment setting */
1428 g_ai_hdma.Init.SrcTransWidth = DMAC_TRANS_WIDTH_32;  /* source transfer width */
1429 g_ai_hdma.Init.DstTransWidth = DMAC_TRANS_WIDTH_32;  /* destination transfer width */
1430 g_ai_hdma.Init.SrcBurstSize = DMAC_MSIZE_4;          /* source burst size */
1431 g_ai_hdma.Init.DstBurstSize = DMAC_MSIZE_4;          /* destination burst size */
1432 g_ai_hdma.Init.IocBlkTrans = 0;                      /* IOC block transfer */
1433 g_ai_hdma.localdata = (void *)ctx;
1434
1435 return dma_init(&g_ai_hdma);
1436}
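/*
 * Minimal usage sketch (illustrative, not part of the original file): a
 * typical call sequence for the API above. The names model_data, image and
 * on_done are placeholders, and the callback signature assumes that
 * kpu_done_callback_t takes the userdata pointer, matching the invocation
 * from kpu_kmodel_done() above.
 */
#if 0
static kpu_model_context_t g_task;

static void on_done(void *userdata)
{
    uint8_t *output;
    size_t size;
    if (kpu_get_output(&g_task, 0, &output, &size) == 0)
        syslog(LOG_NOTICE, "inference finished, output 0 is %d bytes", (int)size);
}

static void run_model_once(const uint8_t *model_data, const uint8_t *image)
{
    kpu_init(&g_task);                                        /* set up the AI DMA handle        */
    if (kpu_load_kmodel(&g_task, model_data) != 0)            /* parse headers, allocate buffers */
        return;
    kpu_run_kmodel(&g_task, image, AI_DMA_CH, on_done, NULL); /* start layer 0; the rest runs from the DMA/KPU interrupts */
}
#endif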