Changeset 458 for azure_iot_hub_riscv
Timestamp: Sep 14, 2020, 6:36:03 PM
Location: azure_iot_hub_riscv/trunk
Files: 2 added, 30 edited
azure_iot_hub_riscv/trunk/app_iothub_client/.vscode/tasks.json
r453 → r458

     "label": "write app_iothub_client",
     "command": "/C/Python38/python.exe",
     "args": [
         "../tools/kflash/kflash.py",
         "-p",
         …
azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/atomic.h
r453 → r458 — the header is reindented throughout, with no content change; the r458 text:

#define SPINLOCK_INIT \
    {                 \
        0             \
    }

#define CORELOCK_INIT          \
    {                          \
        .lock = SPINLOCK_INIT, \
        .count = 0,            \
        .core = -1             \
    }

/* Definition of memory barrier macro */
#define mb()                          \
    {                                 \
        asm volatile("fence" ::       \
                         : "memory"); \
    }

#define atomic_set(ptr, val) (*(volatile typeof(*(ptr)) *)(ptr) = val)

…

typedef struct _spinlock
{
    int lock;
} spinlock_t;

typedef struct _semaphore
{
    spinlock_t lock;
    int count;
    int waiting;
} semaphore_t;

typedef struct _corelock
{
    spinlock_t lock;
    int count;
    int core;
} corelock_t;

static inline int spinlock_trylock(spinlock_t *lock)
{
    int res = atomic_swap(&lock->lock, -1);
    /* Use memory barrier to keep coherency */
    mb();
    return res;
}

static inline void spinlock_lock(spinlock_t *lock)
{
    while(spinlock_trylock(lock))
        ;
}

static inline void spinlock_unlock(spinlock_t *lock)
{
    /* Use memory barrier to keep coherency */
    mb();
    atomic_set(&lock->lock, 0);
    asm volatile("nop");
}

static inline void semaphore_signal(semaphore_t *semaphore, int i)
{
    spinlock_lock(&(semaphore->lock));
    semaphore->count += i;
    spinlock_unlock(&(semaphore->lock));
}

static inline void semaphore_wait(semaphore_t *semaphore, int i)
{
    atomic_add(&(semaphore->waiting), 1);
    while(1)
    {
        spinlock_lock(&(semaphore->lock));
        if(semaphore->count >= i)
        {
            semaphore->count -= i;
            atomic_add(&(semaphore->waiting), -1);
            spinlock_unlock(&(semaphore->lock));
            break;
        }
        spinlock_unlock(&(semaphore->lock));
    }
}

static inline int semaphore_count(semaphore_t *semaphore)
{
    int res = 0;

    spinlock_lock(&(semaphore->lock));
    res = semaphore->count;
    spinlock_unlock(&(semaphore->lock));
    return res;
}

static inline int semaphore_waiting(semaphore_t *semaphore)
{
    return atomic_read(&(semaphore->waiting));
}

static inline int corelock_trylock(corelock_t *lock)
{
    int res = 0;
    unsigned long core;

    asm volatile("csrr %0, mhartid;"
                 : "=r"(core));
    if(spinlock_trylock(&lock->lock))
    {
        return -1;
    }

    if(lock->count == 0)
    {
        /* First time get lock */
        lock->count++;
        lock->core = core;
        res = 0;
    } else if(lock->core == core)
    {
        /* Same core get lock */
        lock->count++;
        res = 0;
    } else
    {
        /* Different core get lock */
        res = -1;
    }
    spinlock_unlock(&lock->lock);

    return res;
}

static inline void corelock_lock(corelock_t *lock)
{
    unsigned long core;

    asm volatile("csrr %0, mhartid;"
                 : "=r"(core));
    spinlock_lock(&lock->lock);

    if(lock->count == 0)
    {
        /* First time get lock */
        lock->count++;
        lock->core = core;
    } else if(lock->core == core)
    {
        /* Same core get lock */
        lock->count++;
    } else
    {
        /* Different core get lock */
        spinlock_unlock(&lock->lock);

        do
        {
            while(atomic_read(&lock->count))
                ;
        } while(corelock_trylock(lock));
        return;
    }
    spinlock_unlock(&lock->lock);
}

static inline void corelock_unlock(corelock_t *lock)
{
    unsigned long core;

    asm volatile("csrr %0, mhartid;"
                 : "=r"(core));
    spinlock_lock(&lock->lock);

    if(lock->core == core)
    {
        /* Same core release lock */
        lock->count--;
        if(lock->count <= 0)
        {
            lock->core = -1;
            lock->count = 0;
        }
    } else
    {
        /* Different core release lock */
        spinlock_unlock(&lock->lock);

        register unsigned long a7 asm("a7") = 93;
        register unsigned long a0 asm("a0") = 0;
        register unsigned long a1 asm("a1") = 0;
        register unsigned long a2 asm("a2") = 0;

        asm volatile("scall"
                     : "+r"(a0)
                     : "r"(a1), "r"(a2), "r"(a7));
    }
    spinlock_unlock(&lock->lock);
}
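For orientation, a minimal usage sketch of the recursive per-core lock this header implements. This is a sketch under assumptions: my_lock and shared_counter are hypothetical names, and atomic_swap/atomic_read are taken to be supplied for the K210's two RISC-V cores alongside this header.

#include "atomic.h"

static corelock_t my_lock = CORELOCK_INIT; /* hypothetical: one lock shared by both cores */
static int shared_counter;                 /* hypothetical: state the lock protects */

void bump_counter(void)
{
    corelock_lock(&my_lock);  /* spins only while the *other* core holds the lock */
    shared_counter++;         /* re-entry from the same core just bumps lock->count */
    corelock_unlock(&my_lock);
}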
azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/incbin.h
r453 → r458

 #include <limits.h>

-# define INCBIN_ALIGNMENT_INDEX 7
+# define INCBIN_ALIGNMENT_INDEX 8

 /* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX' */
 …
 #define INCBIN_ALIGN_SHIFT_6 64
 #define INCBIN_ALIGN_SHIFT_7 128
+#define INCBIN_ALIGN_SHIFT_8 256

 /* Actual alignment value */
 #define INCBIN_ALIGNMENT                           \
     INCBIN_CONCATENATE(                            \
         INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \
         INCBIN_ALIGNMENT_INDEX)

…

 #if defined(__APPLE__)
 /* The directives are different for Apple branded compilers */
-# define INCBIN_SECTION ".data\n"
+# define INCBIN_SECTION ".rodata\n"
 # define INCBIN_GLOBAL(NAME) ".globl " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
 # define INCBIN_INT ".long "
 …
 #else
-# define INCBIN_SECTION ".section .data\n"
+# define INCBIN_SECTION ".section .rodata\n"
 # define INCBIN_GLOBAL(NAME) ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
 # define INCBIN_INT ".int "

The remaining hunks only realign backslash-continuation indentation, with no content change, in the stringize/concatenate helpers (INCBIN_STR, INCBIN_STRINGIZE, INCBIN_CAT, INCBIN_CONCATENATE, INCBIN_EVAL, INCBIN_INVOKE), INCBIN_ALIGN, the ARM compiler detection, INCBIN_STYLE_IDENT, INCBIN_STYLE_STRING, INCBIN_GLOBAL_LABELS, INCBIN_EXTERN, and the INCBIN macro itself. For reference, INCBIN_EXTERN as of r458:

#define INCBIN_EXTERN(NAME)                          \
    INCBIN_EXTERNAL INCBIN_ALIGN unsigned char       \
        INCBIN_CONCATENATE(                          \
            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
            INCBIN_STYLE_IDENT(DATA))[];             \
    INCBIN_EXTERNAL INCBIN_ALIGN unsigned char *     \
        INCBIN_CONCATENATE(                          \
            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
            INCBIN_STYLE_IDENT(END));                \
    INCBIN_EXTERNAL unsigned int                     \
        INCBIN_CONCATENATE(                          \
            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
            INCBIN_STYLE_IDENT(SIZE))
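As a usage sketch: one INCBIN() invocation embeds a file and declares three symbols through INCBIN_EXTERN. The g prefix and camel-case suffixes below assume incbin's defaults (nothing in these hunks overrides them), and model.kmodel is a hypothetical file name.

#include "incbin.h"

INCBIN(Model, "model.kmodel"); /* as of r458 the bytes land in .rodata */

/* The expansion declares, per INCBIN_EXTERN above:
 *   extern unsigned char gModelData[];
 *   extern unsigned char *gModelEnd;
 *   extern unsigned int gModelSize;
 */
const unsigned char *model_begin(void) { return gModelData; }
unsigned int model_length(void) { return gModelSize; }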
azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.c
r453 → r458

 #include "utils.h"
 #include "kpu_main.h"
+#include "kernel_cfg.h"

 #define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))

-void sysctl_enable_irq(void)
-{
-    set_csr(mie, MIP_MEIP);
-    set_csr(mstatus, MSTATUS_MIE);
-}
-
-void sysctl_disable_irq(void)
-{
-    clear_csr(mie, MIP_MEIP);
-    clear_csr(mstatus, MSTATUS_MIE);
-}
-
 uint64_t sysctl_get_time_us(void)
 {
     uint64_t v_cycle = read_cycle();
     return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
 }

is_memory, is_memory_cache, plic_irq_enable, and plic_set_priority are reindented only; as of r458:

static int is_memory(uintptr_t address)
{
    enum
    {
        mem_len = 6 * 1024 * 1024,
        mem_no_cache_len = 8 * 1024 * 1024,
    };
    return ((address >= 0x80000000) && (address < 0x80000000 + mem_len)) || ((address >= 0x40000000) && (address < 0x40000000 + mem_no_cache_len)) || (address == 0x50450040);
}

uint32_t is_memory_cache(uintptr_t address)
{
#define MEM_CACHE_LEN (6 * 1024 * 1024)

    return ((address >= 0x80000000) && (address < 0x80000000 + MEM_CACHE_LEN));
}

int plic_irq_enable(INTNO irq_number)
{
    if (irq_number != INTNO_AI)
        return -1;
    ena_int(irq_number);
    return 0;
}

int plic_set_priority(INTNO irq_number, uint32_t priority)
{
    if (irq_number != INTNO_AI)
        return -1;
    set_ipriority(irq_number, priority);
    return 0;
}

The interrupt plumbing is rewritten to use per-interrupt masking (dis_int/ena_int from the TOPPERS kernel) instead of kernel-wide CPU locks (loc_cpu/unl_cpu) and the removed sysctl_*_irq helpers:

 void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
 {
-    ER ret;
     if (irq != INTNO_AI)
         return;

-    ret = loc_cpu();
+    dis_int(INTNO_AI);

     ai_done_callback = callback;
     ai_done_ctx = ctx;

-    if (ret == E_OK)
-        unl_cpu();
+    ena_int(INTNO_AI);
 }

 void ai_done_isr(intptr_t exinf)
 {
-    sysctl_disable_irq();
-    if (ai_done_callback != NULL){
+    dis_int(INTNO_AI);
+    if (ai_done_callback != NULL)
+    {
         ai_done_callback(ai_done_ctx);
     }
-    sysctl_enable_irq();
+    ena_int(INTNO_AI);
 }

 void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
                            plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
 {
-    ER ret;
     if (channel_num != AI_DMA_CH)
         return;

-    set_ipriority(INTNO_DMAAI, priority);
-
-    ret = loc_cpu();
+    //set_ipriority(INTNO_DMAAI, priority);
+
+    dis_int(INTNO_DMAAI);

     ai_dma_done_callback = dmac_callback;
     ai_dma_done_ctx = ctx;

-    if (ret == E_OK)
-        unl_cpu();
+    ena_int(INTNO_DMAAI);
 }

 void ai_dma_done_isr(DMA_Handle_t *dma)
 {
-    sysctl_disable_irq();
-    if (ai_dma_done_callback != NULL) {
+    dis_int(INTNO_DMAAI);
+
+    if (ai_dma_done_callback != NULL)
+    {
         ai_dma_done_callback(ai_dma_done_ctx);
     }
-    sysctl_enable_irq();
+
+    ena_int(INTNO_DMAAI);
 }

dmac_set_irq receives the identical rewrite (loc_cpu/unl_cpu replaced by dis_int/ena_int, set_ipriority commented out). dmac_set_single_mode is reindented only; as of r458:

void dmac_set_single_mode(dmac_channel_number_t channel_num,
                          const void *src, void *dest, uint8_t src_inc,
                          uint8_t dest_inc,
                          uint8_t dmac_burst_size,
                          uint8_t dmac_trans_width,
                          size_t block_size)
{
    if (channel_num != AI_DMA_CH)
        return;

    DMA_Handle_t *hdma = &g_ai_hdma;
    int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
    uint8_t flow_control;
    if (mem_type_src == 0 && mem_type_dest == 0)
        flow_control = DMA_PERIPH_TO_PERIPH;
    else if (mem_type_src == 1 && mem_type_dest == 0)
        flow_control = DMA_MEMORY_TO_PERIPH;
    else if (mem_type_src == 0 && mem_type_dest == 1)
        flow_control = DMA_PERIPH_TO_MEMORY;
    else
        flow_control = DMA_MEMORY_TO_MEMORY;

    hdma->Init.Direction = flow_control;                                              /* DMA transfer direction */
    hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);   /* source handshake */
    hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);  /* destination handshake */
    hdma->Init.SrcInc = src_inc;                                                      /* source increment setting */
    hdma->Init.DstInc = dest_inc;                                                     /* destination increment setting */
    hdma->Init.SrcTransWidth = dmac_trans_width;                                      /* source transfer width */
    hdma->Init.DstTransWidth = dmac_trans_width;                                      /* destination transfer width */
    hdma->Init.SrcBurstSize = dmac_burst_size;                                        /* source burst size */
    hdma->Init.DstBurstSize = dmac_burst_size;                                        /* destination burst size */
    dma_reset(hdma);

    dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
}

Removed outright: the legacy task-based KPU pipeline — kpu_context_t and its global g_kpu_context, kpu_run_all_done, kpu_continue, kpu_run_dma_output, kpu_run_dma_input_done_push_layers, kpu_run_dma_input, kpu_run, kpu_get_output_buf, kpu_release_output_buf, kpu_done, kpu_config_input, kpu_data_output, kpu_data_ready, kpu_data_input, kpu_single_task_init, kpu_single_task_deinit, kpu_model_load_from_buffer, and kpu_start — plus kpu_init, kpu_conv2d_output, kpu_conv2d_output_full_add, and kpu_add.

The retained hardware path is reindented only; as of r458:

static void kpu_send_layer(const kpu_layer_argument_t *layer)
{
    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
    kpu->layer_argument_fifo = layer->image_addr.reg;
    kpu->layer_argument_fifo = layer->image_channel_num.reg;
    kpu->layer_argument_fifo = layer->image_size.reg;
    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_offset.reg;
    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
    kpu->layer_argument_fifo = layer->conv_value.reg;
    kpu->layer_argument_fifo = layer->conv_value2.reg;
    kpu->layer_argument_fifo = layer->dma_parameter.reg;
}

void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
{
    uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
}

static void kpu_conv2d_core(kpu_layer_argument_t *layer)
{
    kpu_send_layer(layer);
}

void kpu_conv2d(kpu_layer_argument_t *layer)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_conv2d_core(layer);
}
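To make the new registration flow concrete, a hedged sketch of hooking the AI-done interrupt after this change. my_done_cb, my_ctx, and the kpu.h include are hypothetical stand-ins; INTNO_AI, dis_int, and ena_int come from the TOPPERS kernel configuration that kernel_cfg.h now pulls in.

#include "kpu.h" /* hypothetical: whichever header exposes the plic_* wrappers */

static int my_done_cb(void *ctx) /* hypothetical completion callback */
{
    /* Called from ai_done_isr while INTNO_AI is masked via dis_int(). */
    (void)ctx;
    return 0;
}

void attach_kpu_callback(void)
{
    static int my_ctx;                                /* hypothetical context object */
    plic_set_priority(INTNO_AI, 1);                   /* wraps set_ipriority() */
    plic_irq_register(INTNO_AI, my_done_cb, &my_ctx); /* swaps the callback under dis_int/ena_int */
    plic_irq_enable(INTNO_AI);                        /* wraps ena_int() */
}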
…

The CPU-side kmodel operators are otherwise reindented with no functional change: kpu_global_average_pool, kpu_global_average_pool_float, kpu_flush_cache (under USE_CACHED_AI_RAM), kpu_carry_shift, kpu_upload_core, kpu_kmodel_input_with_padding, kpu_kmodel_input_float, kpu_float_activation, kpu_kmodel_add, kpu_quantized_add, kpu_global_average_pool2d, kpu_quantized_max_pool2d, kpu_average_pool2d, kpu_quantize, kpu_kmodel_dequantize, kpu_kmodel_channelwise_dequantize, kpu_requantize, kpu_l2_normalization, kpu_softmax, kpu_concat, kpu_kmodel_fully_connected, and kpu_tf_flatten. Removed together with the legacy pipeline: kpu_matmul_end, kpu_fully_connected, kpu_dequantize, and kpu_input_with_padding. Three of the retained operators, as of r458:

static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
{
    if (shift > 0)
    {
        value >>= shift - 1;
        if (value & 0x1)
        {
            if (value < 0)
                value = (value >> 1) - 1;
            else
                value = (value >> 1) + 1;
        }
        else
        {
            value >>= 1;
        }
    }

    return value;
}

…

static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    size_t count = arg->count;
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);

    kpu_model_quant_param_t q = arg->quant_param;

    float scale = 1.f / q.scale;

    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
    size_t i;
    for (i = 0; i < count; i++)
    {
        int value = roundf((*src++ - q.bias) * scale);
        if (value < 0)
            value = 0;
        if (value > 0xFF)
            value = 0xFF;
        *dest++ = (uint8_t)value;
    }
}

…

static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t oc, channels = arg->channels;

    float max = FLT_MIN;
    for (oc = 0; oc < channels; oc++)
        max = fmaxf(max, src[oc]);

    float sum = 0.f;
    for (oc = 0; oc < channels; oc++)
    {
        float value = expf(src[oc] - max);
        sum += value;
        dest[oc] = value;
    }

    for (oc = 0; oc < channels; oc++)
        dest[oc] /= sum;
}

…

static void kpu_resize_nearest_neighbor(const …
kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx) 1380 919 { 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 for(oc = 0; oc < in_shape.channels; oc++)1391 1392 1393 for(oy = 0; oy < out_height; oy++)1394 1395 1396 1397 for(ox = 0; ox < out_width; ox++)1398 1399 1400 1401 1402 1403 920 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 921 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 922 kpu_model_shape_t in_shape = arg->in_shape; 923 uint32_t out_width = arg->out_width, out_height = arg->out_height; 924 uint32_t oc, oy, ox; 925 926 float height_scale = (float)in_shape.height / out_height; 927 float width_scale = (float)in_shape.width / out_width; 928 929 for (oc = 0; oc < in_shape.channels; oc++) 930 { 931 const float *channel_src = src + in_shape.width * in_shape.height * oc; 932 for (oy = 0; oy < out_height; oy++) 933 { 934 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1); 935 const float *y_origin = channel_src + in_y * in_shape.width; 936 for (ox = 0; ox < out_width; ox++) 937 { 938 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1); 939 *dest++ = y_origin[in_x]; 940 } 941 } 942 } 1404 943 } 1405 944 1406 945 static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx) 1407 946 { 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 for(oc = 0; oc < in_shape.channels; oc++)1418 1419 1420 for(oy = 0; oy < out_height; oy++)1421 1422 1423 1424 for(ox = 0; ox < out_width; ox++)1425 1426 1427 1428 1429 1430 947 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 948 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address); 949 kpu_model_shape_t in_shape = arg->in_shape; 950 uint32_t out_width = arg->out_width, out_height = arg->out_height; 951 uint32_t oc, oy, ox; 952 953 float height_scale = (float)in_shape.height / out_height; 954 float width_scale = (float)in_shape.width / out_width; 955 956 for (oc = 0; oc < in_shape.channels; oc++) 957 { 958 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc; 959 for (oy = 0; oy < out_height; oy++) 960 { 961 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1); 962 const uint8_t *y_origin = channel_src + in_y * in_shape.width; 963 for (ox = 0; ox < out_width; ox++) 964 { 965 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1); 966 *dest++ = y_origin[in_x]; 967 } 968 } 969 } 1431 970 } 1432 971 1433 972 static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx) 1434 973 { 1435 1436 1437 1438 1439 for(oc = 0; oc < channels; oc++)1440 974 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 975 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 976 size_t oc, channels = arg->channels; 977 978 for (oc = 0; oc < channels; oc++) 979 dest[oc] = 1.f / (1.f + expf(-src[oc])); 1441 980 } 1442 981 1443 982 static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx) 1444 983 { 1445 volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset); 1446 layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM; 1447 layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - 
IOMEM; 1448 layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM; 1449 1450 if(arg->flags & KLF_MAIN_MEM_OUT) 1451 { 1452 dmac_channel_number_t dma_ch = ctx->dma_ch; 1453 uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address; 1454 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 1455 .calc_done_int = 1, 1456 .layer_cfg_almost_empty_int = 1, 1457 .layer_cfg_almost_full_int = 1}; 1458 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1459 .calc_done_int = 1, 1460 .layer_cfg_almost_empty_int = 1, 1461 .layer_cfg_almost_full_int = 1}; 1462 layer.dma_parameter.data.send_data_out = 1; 1463 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ); 1464 if(ctx->current_layer < ctx->layers_length) 1465 dmac_set_irq(dma_ch, ai_step, ctx, 1); 1466 else 1467 dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1); 1468 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT, 1469 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8); 1470 } else 1471 { 1472 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 1473 .calc_done_int = 1, 1474 .layer_cfg_almost_empty_int = 1, 1475 .layer_cfg_almost_full_int = 1}; 1476 1477 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1478 .calc_done_int = 0, 1479 .layer_cfg_almost_empty_int = 1, 1480 .layer_cfg_almost_full_int = 1}; 1481 layer.interrupt_enabe.data.int_en = 1; 1482 } 1483 1484 kpu_send_layer((const kpu_layer_argument_t *)&layer); 984 volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset); 985 layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM; 986 layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM; 987 layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM; 988 989 if (arg->flags & KLF_MAIN_MEM_OUT) 990 { 991 dmac_channel_number_t dma_ch = ctx->dma_ch; 992 uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address; 993 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 994 .calc_done_int = 1, 995 .layer_cfg_almost_empty_int = 1, 996 .layer_cfg_almost_full_int = 1}; 997 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 998 .calc_done_int = 1, 999 .layer_cfg_almost_empty_int = 1, 1000 .layer_cfg_almost_full_int = 1}; 1001 layer.dma_parameter.data.send_data_out = 1; 1002 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ); 1003 if (ctx->current_layer < ctx->layers_length) 1004 dmac_set_irq(dma_ch, ai_step, ctx, 1); 1005 else 1006 dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1); 1007 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT, 1008 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8); 1009 } 1010 else 1011 { 1012 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 1013 .calc_done_int = 1, 1014 .layer_cfg_almost_empty_int = 1, 1015 .layer_cfg_almost_full_int = 1}; 1016 1017 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1018 .calc_done_int = 0, 1019 .layer_cfg_almost_empty_int = 1, 1020 .layer_cfg_almost_full_int = 1}; 1021 layer.interrupt_enabe.data.int_en = 1; 1022 } 1023 1024 kpu_send_layer((const kpu_layer_argument_t *)&layer); 1485 1025 } 1486 1026 1487 1027 static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t 
*ctx) 1488 1028 { 1489 1029 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 1490 1030 #if USE_CACHED_AI_RAM 1491 1031 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64); 1492 1032 #else 1493 1033 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64); 1494 1034 #endif 1495 1035 1496 1497 1498 1499 1500 1501 1502 for(oc = 0; oc < channels; oc++)1503 1504 1505 for(y = 0; y < 1; y++)1506 1507 1508 for(x = 0; x < 1; x++)1509 1510 1511 1036 uint32_t row_padding = 16; 1037 uint32_t row_group = 4; 1038 uint32_t row_length = 1; 1039 uint32_t height = 4; 1040 uint32_t oc, x, y, channels = arg->channels; 1041 1042 for (oc = 0; oc < channels; oc++) 1043 { 1044 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; 1045 for (y = 0; y < 1; y++) 1046 { 1047 uint8_t *y_origin = channel_origin + y * row_length * 64; 1048 for (x = 0; x < 1; x++) 1049 y_origin[x] = *src++; 1050 } 1051 } 1512 1052 1513 1053 #if USE_CACHED_AI_RAM 1514 1515 1054 uint32_t lines = row_length * height * channels / row_group; 1055 kpu_flush_cache(arg->kpu_mem_out_address, lines); 1516 1056 #endif 1517 1057 } … … 1519 1059 static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx) 1520 1060 { 1521 1522 1523 1524 1525 for(oc = 0; oc < channels; oc++)1526 1061 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 1062 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address); 1063 uint32_t oc, channels = arg->channels; 1064 1065 for (oc = 0; oc < channels; oc++) 1066 *dest++ = src[oc * 16]; 1527 1067 } 1528 1068 1529 1069 static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx) 1530 1070 { 1531 1532 1533 1534 1535 1071 size_t width = arg->width; 1072 size_t height = arg->height; 1073 size_t channels = arg->channels; 1074 1075 kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address); 1536 1076 } 1537 1077 … … 1539 1079 { 1540 1080 #if FIX_CACHE 1541 1081 configASSERT(is_memory_cache((uintptr_t)buffer)); 1542 1082 #endif 1543 uintptr_t base_addr = (uintptr_t)buffer; 1544 const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer; 1545 1546 if (header->version == 3 && header->arch == 0) 1547 { 1548 ctx->is_nncase = 0; 1549 ctx->model_buffer = buffer; 1550 ctx->output_count = header->output_count; 1551 ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t)); 1552 ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count); 1553 ctx->layers_length = header->layers_length; 1554 ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length); 1555 ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage); 1556 if (!ctx->main_buffer) 1557 return -1; 1558 uint32_t body_size = 0; 1559 for (int i=0; i<ctx->layers_length; i++) 1560 { 1561 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i; 1562 body_size += cnt_layer_header->body_size; 1563 } 1564 uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM); 1565 const uint8_t *body_start_cache = ctx->body_start; 1566 memcpy(body_start_iomem, body_start_cache, body_size); 1567 for (int i=0; i<body_size; i++) 1568 { 1569 
configASSERT(body_start_iomem[i] == body_start_cache[i]); 1570 } 1571 1572 } else 1573 { 1574 return -1; 1575 } 1576 1577 return 0; 1083 uintptr_t base_addr = (uintptr_t)buffer; 1084 const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer; 1085 1086 if (header->version == 3 && header->arch == 0) 1087 { 1088 ctx->model_buffer = buffer; 1089 ctx->output_count = header->output_count; 1090 ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t)); 1091 ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count); 1092 ctx->layers_length = header->layers_length; 1093 ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length); 1094 ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage); 1095 if (!ctx->main_buffer) 1096 return -1; 1097 uint32_t body_size = 0; 1098 for (int i = 0; i < ctx->layers_length; i++) 1099 { 1100 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i; 1101 body_size += cnt_layer_header->body_size; 1102 } 1103 uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM); 1104 const uint8_t *body_start_cache = ctx->body_start; 1105 memcpy(body_start_iomem, body_start_cache, body_size); 1106 for (int i = 0; i < body_size; i++) 1107 { 1108 configASSERT(body_start_iomem[i] == body_start_cache[i]); 1109 } 1110 } 1111 else 1112 { 1113 return -1; 1114 } 1115 1116 return 0; 1578 1117 } 1579 1118 1580 1119 int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size) 1581 1120 { 1582 if(ctx->is_nncase) 1583 return -1; 1584 1585 if(index >= ctx->output_count) 1586 return -1; 1587 1588 const kpu_model_output_t *output = ctx->outputs + index; 1589 *data = ctx->main_buffer + output->address; 1590 *size = output->size; 1591 return 0; 1121 if (index >= ctx->output_count) 1122 return -1; 1123 1124 const kpu_model_output_t *output = ctx->outputs + index; 1125 *data = ctx->main_buffer + output->address; 1126 *size = output->size; 1127 return 0; 1592 1128 } 1593 1129 1594 1130 void kpu_model_free(kpu_model_context_t *ctx) 1595 1131 { 1596 if(ctx->is_nncase) 1597 return; 1598 1599 free(ctx->main_buffer); 1600 ctx->main_buffer = NULL; 1132 free(ctx->main_buffer); 1133 ctx->main_buffer = NULL; 1601 1134 } 1602 1135 … … 1609 1142 static const char *str_layer_type(uint32_t type) 1610 1143 { 1611 switch(type)1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1144 switch (type) 1145 { 1146 case KL_ADD: 1147 return "Add"; 1148 case KL_QUANTIZED_ADD: 1149 return "QuantAdd"; 1150 case KL_GLOBAL_AVERAGE_POOL2D: 1151 return "GAP"; 1152 case KL_QUANTIZED_MAX_POOL2D: 1153 return "QuantMaxPool2d"; 1154 case KL_AVERAGE_POOL2D: 1155 return "AveragePool2d"; 1156 case KL_QUANTIZE: 1157 return "Quantize"; 1158 case KL_DEQUANTIZE: 1159 return "Dequantize"; 1160 case KL_REQUANTIZE: 1161 return "Requantize"; 1162 case KL_L2_NORMALIZATION: 1163 return "L2Norm"; 1164 case KL_SOFTMAX: 1165 return "Softmax"; 1166 case KL_CONCAT: 1167 return "Concat"; 1168 case KL_QUANTIZED_CONCAT: 1169 return "QuantConcat"; 1170 case KL_FULLY_CONNECTED: 1171 return "FullyConnected"; 1172 case KL_TENSORFLOW_FLATTEN: 1173 return "TFFlatten"; 1174 case KL_RESIZE_NEAREST_NEIGHBOR: 1175 return 
"ResizeNearestNeighbor"; 1176 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR: 1177 return "QuantResizeNearestNeighbor"; 1178 case KL_CHANNELWISE_DEQUANTIZE: 1179 return "ChannelwiseDequantize"; 1180 case KL_LOGISTIC: 1181 return "Logistic"; 1182 case KL_K210_CONV: 1183 return "K210Conv"; 1184 case KL_K210_ADD_PADDING: 1185 return "K210AddPad"; 1186 case KL_K210_REMOVE_PADDING: 1187 return "K210RemovePad"; 1188 case KL_K210_UPLOAD: 1189 return "K210Upload"; 1190 default: 1191 return "Unknown"; 1192 } 1660 1193 } 1661 1194 #endif … … 1663 1196 static int kpu_kmodel_done(kpu_model_context_t *ctx) 1664 1197 { 1665 1666 1667 1668 1669 1670 1671 1672 1198 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 1199 .calc_done_int = 1, 1200 .layer_cfg_almost_empty_int = 1, 1201 .layer_cfg_almost_full_int = 1}; 1202 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1203 .calc_done_int = 1, 1204 .layer_cfg_almost_empty_int = 1, 1205 .layer_cfg_almost_full_int = 1}; 1673 1206 #if KPU_DEBUG 1674 uint32_t cnt_layer_id = ctx->current_layer - 1;1675 1676 if(last_time != 0)1677 1678 1679 syslog(LOG_NOTICE, "layer %d [%s]: %f ms", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);1680 1681 if(last_layer_type == KL_K210_CONV)1682 1683 1684 1685 syslog(LOG_NOTICE, "KPU: %f ms", kpu_time / 1000.0);1686 syslog(LOG_NOTICE, "CPU: %f ms", (total_time - kpu_time) / 1000.0);1687 syslog(LOG_NOTICE, "Model: %f ms", total_time / 1000.0);1207 uint32_t cnt_layer_id = ctx->current_layer; 1208 uint64_t time = sysctl_get_time_us(); 1209 if (last_time != 0) 1210 { 1211 uint64_t layer_time = time - last_time; 1212 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000); 1213 total_time += layer_time; 1214 if (last_layer_type == KL_K210_CONV) 1215 kpu_time += layer_time; 1216 } 1217 1218 syslog(LOG_NOTICE, "KPU: %d.%03d ms", kpu_time / 1000, kpu_time % 1000); 1219 syslog(LOG_NOTICE, "CPU: %d.%03d ms", (total_time - kpu_time) / 1000, (total_time - kpu_time) % 1000); 1220 syslog(LOG_NOTICE, "Model: %d.%03d ms", total_time / 1000, total_time % 1000); 1688 1221 #endif 1689 1690 1222 ctx->done_callback(ctx->userdata); 1223 return 0; 1691 1224 } 1692 1225 1693 1226 static int ai_step(void *userdata) 1694 1227 { 1695 kpu_model_context_t *ctx = (kpu_model_context_t *)userdata; 1696 1697 uint32_t cnt_layer_id = ctx->current_layer; 1698 const uint8_t *layer_body = ctx->current_body; 1699 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id; 1700 if (cnt_layer_id >= ctx->layers_length) { 1701 //syslog(LOG_NOTICE, "overrun"); 1702 kpu_kmodel_done(ctx); 1703 return -1; 1704 } 1705 1706 ctx->current_layer++; 1707 ctx->current_body += cnt_layer_header->body_size; 1228 kpu_model_context_t *ctx = (kpu_model_context_t *)userdata; 1229 1230 uint32_t cnt_layer_id = ctx->current_layer; 1231 const uint8_t *layer_body = ctx->current_body; 1232 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id; 1233 if (cnt_layer_id >= ctx->layers_length) 1234 { 1235 //syslog(LOG_NOTICE, "overrun"); 1236 kpu_kmodel_done(ctx); 1237 return -1; 1238 } 1239 1240 ctx->current_layer++; 1241 ctx->current_body += cnt_layer_header->body_size; 1708 1242 1709 1243 #if KPU_DEBUG 1710 1711 if(last_time != 0)1712 1713 1714 1715 1716 if(last_layer_type == KL_K210_CONV)1717 1718 1719 1720 1721 1244 uint64_t time = sysctl_get_time_us(); 1245 if (last_time != 0) 1246 { 1247 uint64_t layer_time = time - 
last_time; 1248 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000); 1249 total_time += layer_time; 1250 if (last_layer_type == KL_K210_CONV) 1251 kpu_time += layer_time; 1252 } 1253 1254 last_layer_type = cnt_layer_header->type; 1255 last_time = sysctl_get_time_us(); 1722 1256 #endif 1723 1257 1724 switch(cnt_layer_header->type)1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 if (ctx->current_layer < ctx->layers_length)1797 1798 1799 1800 1258 switch (cnt_layer_header->type) 1259 { 1260 case KL_ADD: 1261 kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx); 1262 break; 1263 case KL_QUANTIZED_ADD: 1264 kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx); 1265 break; 1266 case KL_GLOBAL_AVERAGE_POOL2D: 1267 kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx); 1268 break; 1269 case KL_QUANTIZED_MAX_POOL2D: 1270 kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx); 1271 break; 1272 case KL_AVERAGE_POOL2D: 1273 kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx); 1274 break; 1275 case KL_QUANTIZE: 1276 kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx); 1277 break; 1278 case KL_DEQUANTIZE: 1279 kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx); 1280 break; 1281 case KL_REQUANTIZE: 1282 kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx); 1283 break; 1284 case KL_L2_NORMALIZATION: 1285 kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx); 1286 break; 1287 case KL_SOFTMAX: 1288 kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx); 1289 break; 1290 case KL_CONCAT: 1291 case KL_QUANTIZED_CONCAT: 1292 kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx); 1293 break; 1294 case KL_FULLY_CONNECTED: 1295 kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx); 1296 break; 1297 case KL_TENSORFLOW_FLATTEN: 1298 kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx); 1299 break; 1300 case KL_RESIZE_NEAREST_NEIGHBOR: 1301 kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx); 1302 break; 1303 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR: 1304 kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx); 1305 break; 1306 case KL_CHANNELWISE_DEQUANTIZE: 1307 kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx); 1308 break; 1309 case KL_LOGISTIC: 1310 kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx); 1311 break; 1312 case KL_K210_CONV: 1313 kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx); 1314 return 0; 1315 case KL_K210_ADD_PADDING: 1316 kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx); 1317 break; 1318 case KL_K210_REMOVE_PADDING: 1319 kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t 
*)layer_body, ctx); 1320 break; 1321 case KL_K210_UPLOAD: 1322 kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx); 1323 break; 1324 default: 1325 assert(!"Layer is not supported."); 1326 kpu_kmodel_done(ctx); 1327 return -1; 1328 } 1329 1330 if (ctx->current_layer < (ctx->layers_length - 1)) 1331 ai_step(userdata); 1332 else 1333 kpu_kmodel_done(ctx); 1334 return 0; 1801 1335 } 1802 1336 1803 1337 static void ai_step_not_isr(void *userdata) 1804 1338 { 1805 sysctl_disable_irq(); 1806 ai_step(userdata); 1807 sysctl_enable_irq(); 1339 dis_int(INTNO_DMAAI); 1340 dis_int(INTNO_AI); 1341 1342 ai_step(userdata); 1343 1344 ena_int(INTNO_DMAAI); 1345 ena_int(INTNO_AI); 1808 1346 } 1809 1347 1810 1348 int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata) 1811 1349 { 1812 if(ctx->is_nncase) 1813 return -1; 1814 1815 ctx->dma_ch = dma_ch; 1816 ctx->done_callback = done_callback; 1817 ctx->userdata = userdata; 1818 ctx->current_layer = 0; 1819 ctx->current_body = ctx->body_start; 1350 ctx->dma_ch = dma_ch; 1351 ctx->done_callback = done_callback; 1352 ctx->userdata = userdata; 1353 ctx->current_layer = 0; 1354 ctx->current_body = ctx->body_start; 1820 1355 #if KPU_DEBUG 1821 1822 1823 1356 last_time = 0; 1357 total_time = 0; 1358 kpu_time = 0; 1824 1359 #endif 1825 1360 1826 kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer; 1827 kpu->interrupt_clear.reg = 7; 1828 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){ 1829 .fifo_full_threshold = 10, .fifo_empty_threshold = 1}; 1830 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){ 1831 .eight_bit_mode = header->flags & 1}; 1832 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1833 .calc_done_int = 1, 1834 .layer_cfg_almost_empty_int = 0, 1835 .layer_cfg_almost_full_int = 1}; 1836 1837 plic_set_priority(INTNO_AI, 1); 1838 plic_irq_register(INTNO_AI, ai_step, ctx); 1839 plic_irq_enable(INTNO_AI); 1840 1841 const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers; 1842 1843 switch(first_layer_header->type) 1844 { 1845 case KL_K210_CONV: 1846 { 1847 const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start; 1848 kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset); 1849 1850 if((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0) 1851 { 1852 kpu_kmodel_input_with_padding(&layer_arg, src); 1853 ai_step_not_isr(ctx); 1854 } else 1855 { 1856 kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx); 1857 } 1858 } 1859 break; 1860 case KL_FULLY_CONNECTED: 1861 { 1862 const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start; 1863 kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels); 1864 ai_step_not_isr(ctx); 1865 } 1866 break; 1867 default: 1868 return -1; 1869 } 1870 1871 return 0; 1872 } 1361 kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer; 1362 kpu->interrupt_clear.reg = 7; 1363 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){ 1364 .fifo_full_threshold = 10, .fifo_empty_threshold = 1}; 1365 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){ 1366 .eight_bit_mode = header->flags & 1}; 1367 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1368 .calc_done_int = 1, 1369 
.layer_cfg_almost_empty_int = 0, 1370 .layer_cfg_almost_full_int = 1}; 1371 1372 //plic_set_priority(INTNO_AI, 1); 1373 plic_irq_register(INTNO_AI, ai_step, ctx); 1374 plic_irq_enable(INTNO_AI); 1375 1376 const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers; 1377 1378 switch (first_layer_header->type) 1379 { 1380 case KL_K210_CONV: 1381 { 1382 const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start; 1383 kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset); 1384 1385 if ((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0) 1386 { 1387 kpu_kmodel_input_with_padding(&layer_arg, src); 1388 ai_step_not_isr(ctx); 1389 } 1390 else 1391 { 1392 kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx); 1393 } 1394 } 1395 break; 1396 case KL_FULLY_CONNECTED: 1397 { 1398 const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start; 1399 kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels); 1400 ai_step_not_isr(ctx); 1401 } 1402 break; 1403 default: 1404 return -1; 1405 } 1406 1407 return 0; 1408 } 1409 1410 ER kpu_init(kpu_model_context_t *ctx) 1411 { 1412 g_ai_hdma.chnum = AI_DMA_CH; 1413 g_ai_hdma.xfercallback = ai_dma_done_isr; 1414 g_ai_hdma.errorcallback = NULL; 1415 g_ai_hdma.Init.Request = DMA_SELECT_AI_RX_REQ; /* DMA request selection */ 1416 g_ai_hdma.Init.Direction = DMA_PERIPH_TO_MEMORY; /* DMA transfer direction */ 1417 g_ai_hdma.Init.SrcMultBlock = DMAC_MULTBLOCK_CONT; /* Source multi-block type */ 1418 g_ai_hdma.Init.DrcMultBlock = DMAC_MULTBLOCK_CONT; /* Destination multi-block type */ 1419 g_ai_hdma.Init.SrcHandShake = DMAC_HS_HARDWARE; /* Source handshake */ 1420 g_ai_hdma.Init.DrcHandShake = DMAC_HS_SOFTWARE; /* Destination handshake */ 1421 g_ai_hdma.Init.SrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* Source hardware handshake polarity */ 1422 g_ai_hdma.Init.DrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* Destination hardware handshake polarity */ 1423 g_ai_hdma.Init.Priority = 4; /* Priority */ 1424 g_ai_hdma.Init.SrcMaster = DMAC_MASTER1; /* Source master setting */ 1425 g_ai_hdma.Init.DstMaster = DMAC_MASTER2; /* Destination master setting */ 1426 g_ai_hdma.Init.SrcInc = DMAC_ADDR_NOCHANGE; /* Source increment setting */ 1427 g_ai_hdma.Init.DstInc = DMAC_ADDR_INCREMENT; /* Destination increment setting */ 1428 g_ai_hdma.Init.SrcTransWidth = DMAC_TRANS_WIDTH_32; /* Source transfer width */ 1429 g_ai_hdma.Init.DstTransWidth = DMAC_TRANS_WIDTH_32; /* Destination transfer width */ 1430 g_ai_hdma.Init.SrcBurstSize = DMAC_MSIZE_4; /* Source burst size */ 1431 g_ai_hdma.Init.DstBurstSize = DMAC_MSIZE_4; /* Destination burst size */ 1432 g_ai_hdma.Init.IocBlkTrans = 0; /* IOC block transfer */ 1433 g_ai_hdma.localdata = (void *)ctx; 1434 1435 return dma_init(&g_ai_hdma); 1436 } -
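Both pool kernels above (kpu_quantized_max_pool2d and kpu_average_pool2d) clip the pooling window against the padded image border with the same max/min arithmetic before the inner loops run, and the average variant divides by the number of samples that actually landed inside the image rather than by kernel_width * kernel_height. A standalone sketch of that index math for one output element, under the same row-major channel-plane layout (avg_pool_one, imax, and imin are illustrative helpers, not SDK functions):

#include <stdint.h>

static inline int32_t imax(int32_t a, int32_t b) { return a > b ? a : b; }
static inline int32_t imin(int32_t a, int32_t b) { return a < b ? a : b; }

/* Average-pool one output element the way kpu_average_pool2d does: the
 * window is clipped to the valid input region, and the divisor is the
 * number of in-bounds samples, not kernel_w * kernel_h. */
static float avg_pool_one(const float *chan, int32_t in_w, int32_t in_h,
                          int32_t out_x, int32_t out_y,
                          int32_t kernel_w, int32_t kernel_h,
                          int32_t stride_w, int32_t stride_h,
                          int32_t pad_w, int32_t pad_h)
{
    int32_t x0 = out_x * stride_w - pad_w; /* window origin, may be negative */
    int32_t y0 = out_y * stride_h - pad_h;
    int32_t kx_begin = imax(0, -x0), kx_end = imin(kernel_w, in_w - x0);
    int32_t ky_begin = imax(0, -y0), ky_end = imin(kernel_h, in_h - y0);
    float sum = 0.f, count = 0.f;

    for (int32_t ky = ky_begin; ky < ky_end; ky++)
        for (int32_t kx = kx_begin; kx < kx_end; kx++)
        {
            sum += chan[(y0 + ky) * in_w + (x0 + kx)];
            count += 1.f;
        }
    return sum / count; /* count >= 1 whenever the window overlaps the image */
}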
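kpu_quantize and kpu_kmodel_dequantize above are a plain affine pair: q = round((x - bias) / scale) clamped to [0, 255] on the way in, and x ~= q * scale + bias on the way out (the driver precomputes 1.f / q.scale once and multiplies, which is the same arithmetic). A minimal host-side round trip assuming nothing beyond libc (quantize_u8 and dequantize_u8 are illustrative names):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Per-tensor quantization parameters, mirroring kpu_model_quant_param_t. */
typedef struct { float scale; float bias; } quant_param;

/* q = round((x - bias) / scale), clamped to [0, 0xFF], as in kpu_quantize. */
static uint8_t quantize_u8(float x, quant_param p)
{
    int v = (int)roundf((x - p.bias) / p.scale);
    if (v < 0) v = 0;
    if (v > 0xFF) v = 0xFF;
    return (uint8_t)v;
}

/* x ~= q * scale + bias, as in kpu_kmodel_dequantize. */
static float dequantize_u8(uint8_t q, quant_param p)
{
    return q * p.scale + p.bias;
}

int main(void)
{
    quant_param p = {.scale = 0.05f, .bias = -6.4f}; /* made-up example values */
    float x = 1.23f;
    uint8_t q = quantize_u8(x, p);
    printf("x=%f q=%u x'=%f\n", x, (unsigned)q, dequantize_u8(q, p));
    return 0;
}

The round trip is lossy by up to half a scale step, which is why the channelwise variant carries one scale/bias pair per output channel to keep that step small.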
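The FC_UNROLL_* macros in kpu_kmodel_fully_connected read more easily once expanded: FC_UNROLL_S(1) declares and loads eight consecutive input/weight pairs (i0..i7, w0..w7), and FC_UNROLL_S(2) then accumulates the eight products, so each pass of the ic loop consumes eight channels. That is why the loop bound is in_channels / 8 and why the fast path requires in_channels % 8 == 0. The same hot loop written out without macros (dot_unroll8 is an illustrative name):

/* 8-way unrolled dot product, equivalent to one oc iteration of the
 * macro-based fast path above (minus the bias add and activation). */
static float dot_unroll8(const float *src, const float *weights,
                         unsigned in_channels /* multiple of 8 */)
{
    float sum = 0.0f;
    for (unsigned ic = 0; ic < in_channels / 8; ic++)
    {
        float i0 = *src++, w0 = *weights++; /* FC_UNROLL_S(1): loads */
        float i1 = *src++, w1 = *weights++;
        float i2 = *src++, w2 = *weights++;
        float i3 = *src++, w3 = *weights++;
        float i4 = *src++, w4 = *weights++;
        float i5 = *src++, w5 = *weights++;
        float i6 = *src++, w6 = *weights++;
        float i7 = *src++, w7 = *weights++;
        sum += i0 * w0; sum += i1 * w1;     /* FC_UNROLL_S(2): accumulate */
        sum += i2 * w2; sum += i3 * w3;
        sum += i4 * w4; sum += i5 * w5;
        sum += i6 * w6; sum += i7 * w7;
    }
    return sum;
}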
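kpu_softmax subtracts the per-vector maximum before calling expf, the standard trick that keeps the exponent non-positive so large logits cannot overflow to infinity. One subtlety: the maximum is seeded with FLT_MIN, which is the smallest positive normalized float (about 1.2e-38), not the most negative one, so for an all-negative input the subtracted value is essentially zero; the output stays finite, but the shift is not a true maximum. (The region_layer.c softmax later in this changeset seeds with input[0] and avoids that quirk.) A standalone version seeded with -FLT_MAX:

#include <float.h>
#include <math.h>
#include <stddef.h>

/* Numerically stable softmax: same shift-by-max scheme as kpu_softmax
 * above, but seeded with -FLT_MAX so the true maximum is found even
 * when every input is negative. */
static void softmax_stable(const float *src, float *dst, size_t n)
{
    float max = -FLT_MAX;
    for (size_t i = 0; i < n; i++)
        max = fmaxf(max, src[i]);

    float sum = 0.f;
    for (size_t i = 0; i < n; i++)
    {
        dst[i] = expf(src[i] - max); /* exponent <= 0, cannot overflow */
        sum += dst[i];
    }
    for (size_t i = 0; i < n; i++)
        dst[i] /= sum;
}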
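kpu_resize_nearest_neighbor and its quantized twin differ only in element type; the source-index computation is identical: scale the output coordinate by in_extent / out_extent, floor, then clamp to the last valid index so the bottom and right output pixels never read past the input. That computation in isolation (nn_source_index is an illustrative helper):

#include <math.h>
#include <stdint.h>

/* Nearest-neighbor source index for output coordinate o, matching
 * (uint32_t)min(floorf(o * scale), in_extent - 1) in the code above. */
static uint32_t nn_source_index(uint32_t o, uint32_t in_extent, uint32_t out_extent)
{
    float scale = (float)in_extent / out_extent;
    float idx = floorf(o * scale);
    float last = (float)(in_extent - 1);
    return (uint32_t)(idx < last ? idx : last);
}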
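kpu_load_kmodel treats a v3 kmodel as four back-to-back regions: the fixed header, output_count output descriptors, layers_length layer headers, and then the packed layer bodies that ai_step later consumes by advancing body_size bytes per layer. A hedged sketch of just that pointer walk, with struct shapes copied from kpu.h (the main_buffer allocation, the IOMEM copy, and its verify loop are left out):

#include <stdint.h>

/* Field layout mirrors kpu_kmodel_header_t / kpu_model_output_t /
 * kpu_model_layer_header_t; all-uint32_t structs, so no padding issues. */
typedef struct {
    uint32_t version, flags, arch, layers_length;
    uint32_t max_start_address, main_mem_usage, output_count;
} kmodel_header;
typedef struct { uint32_t address, size; } kmodel_output;
typedef struct { uint32_t type, body_size; } kmodel_layer_header;

typedef struct {
    const kmodel_output *outputs;
    const kmodel_layer_header *layer_headers;
    const uint8_t *body_start;
} kmodel_view;

static int kmodel_parse(const uint8_t *buf, kmodel_view *v)
{
    const kmodel_header *h = (const kmodel_header *)buf;
    if (h->version != 3 || h->arch != 0)
        return -1; /* same guard as kpu_load_kmodel */

    v->outputs = (const kmodel_output *)(buf + sizeof(kmodel_header));
    v->layer_headers = (const kmodel_layer_header *)(v->outputs + h->output_count);
    v->body_start = (const uint8_t *)(v->layer_headers + h->layers_length);
    return 0;
}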
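The KPU_DEBUG logging in this changeset switches from "%f ms" to "%d.%03d ms", splitting the microsecond counter with integer division and modulo so syslog never needs floating-point formatting on this target. The same split in isolation (log_duration_ms is an illustrative helper; unlike the driver, it casts the 64-bit values explicitly before handing them to printf):

#include <stdint.h>
#include <stdio.h>

/* Print a microsecond duration as "<ms>.<frac> ms" using the
 * us / 1000, us % 1000 split from the debug logging above. */
static void log_duration_ms(const char *label, uint64_t us)
{
    printf("%s: %u.%03u ms\n", label,
           (unsigned)(us / 1000),  /* whole milliseconds */
           (unsigned)(us % 1000)); /* remaining microseconds; %03u pads, so 1005 us prints as 1.005 */
}

For example, log_duration_ms("layer 3 [K210Conv]", 1234) prints "layer 3 [K210Conv]: 1.234 ms".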
azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.h
r453 r458 24 24 #endif 25 25 26 #define kpu_matmul_begin kpu_conv2d_output27 26 #define IOMEM 0x40000000 27 #define dmac_channel_number_t int 28 28 29 29 typedef int (*plic_irq_callback_t)(void *ctx); … … 31 31 typedef struct 32 32 { 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 33 union 34 { 35 uint64_t reg; 36 struct 37 { 38 uint64_t int_en : 1; 39 uint64_t ram_flag : 1; 40 uint64_t full_add : 1; 41 uint64_t depth_wise_layer : 1; 42 uint64_t reserved : 60; 43 } data; 44 } interrupt_enabe; 45 46 union 47 { 48 uint64_t reg; 49 struct 50 { 51 uint64_t image_src_addr : 15; 52 uint64_t reserved0 : 17; 53 uint64_t image_dst_addr : 15; 54 uint64_t reserved1 : 17; 55 } data; 56 } image_addr; 57 58 union 59 { 60 uint64_t reg; 61 struct 62 { 63 uint64_t i_ch_num : 10; 64 uint64_t reserved0 : 22; 65 uint64_t o_ch_num : 10; 66 uint64_t reserved1 : 6; 67 uint64_t o_ch_num_coef : 10; 68 uint64_t reserved2 : 6; 69 } data; 70 } image_channel_num; 71 72 union 73 { 74 uint64_t reg; 75 struct 76 { 77 uint64_t i_row_wid : 10; 78 uint64_t i_col_high : 9; 79 uint64_t reserved0 : 13; 80 uint64_t o_row_wid : 10; 81 uint64_t o_col_high : 9; 82 uint64_t reserved1 : 13; 83 } data; 84 } image_size; 85 86 union 87 { 88 uint64_t reg; 89 struct 90 { 91 uint64_t kernel_type : 3; 92 uint64_t pad_type : 1; 93 uint64_t pool_type : 4; 94 uint64_t first_stride : 1; 95 uint64_t bypass_conv : 1; 96 uint64_t load_para : 1; 97 uint64_t reserved0 : 5; 98 uint64_t dma_burst_size : 8; 99 uint64_t pad_value : 8; 100 uint64_t bwsx_base_addr : 32; 101 } data; 102 } kernel_pool_type_cfg; 103 104 union 105 { 106 uint64_t reg; 107 struct 108 { 109 uint64_t load_coor : 1; 110 uint64_t load_time : 6; 111 uint64_t reserved0 : 8; 112 uint64_t para_size : 17; 113 uint64_t para_start_addr : 32; 114 } data; 115 } kernel_load_cfg; 116 117 union 118 { 119 uint64_t reg; 120 struct 121 { 122 uint64_t coef_column_offset : 4; 123 uint64_t coef_row_offset : 12; 124 uint64_t reserved0 : 48; 125 } data; 126 } kernel_offset; 127 128 union 129 { 130 uint64_t reg; 131 struct 132 { 133 uint64_t channel_switch_addr : 15; 134 uint64_t reserved : 1; 135 uint64_t row_switch_addr : 4; 136 uint64_t coef_size : 8; 137 uint64_t coef_group : 3; 138 uint64_t load_act : 1; 139 uint64_t active_addr : 32; 140 } data; 141 } kernel_calc_type_cfg; 142 143 union 144 { 145 uint64_t reg; 146 struct 147 { 148 uint64_t wb_channel_switch_addr : 15; 149 uint64_t reserved0 : 1; 150 uint64_t wb_row_switch_addr : 4; 151 uint64_t wb_group : 3; 152 uint64_t reserved1 : 41; 153 } data; 154 } write_back_cfg; 155 156 union 157 { 158 uint64_t reg; 159 struct 160 { 161 uint64_t shr_w : 4; 162 uint64_t shr_x : 4; 163 uint64_t arg_w : 24; 164 uint64_t arg_x : 24; 165 uint64_t reserved0 : 8; 166 } data; 167 } conv_value; 168 169 union 170 { 171 uint64_t reg; 172 struct 173 { 174 uint64_t arg_add : 40; 175 uint64_t reserved : 24; 176 } data; 177 } conv_value2; 178 179 union 180 { 181 uint64_t reg; 182 struct 183 { 184 uint64_t send_data_out : 1; 185 uint64_t reserved : 15; 186 uint64_t 
channel_byte_num : 16; 187 uint64_t dma_total_byte : 32; 188 } data; 189 } dma_parameter; 190 190 } kpu_layer_argument_t; 191 191 192 192 typedef struct 193 193 { 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 194 union 195 { 196 uint64_t reg; 197 struct 198 { 199 uint64_t shift_number : 8; 200 uint64_t y_mul : 16; 201 uint64_t x_start : 36; 202 } data; 203 } activate_para[16]; 204 205 union 206 { 207 uint64_t reg; 208 struct 209 { 210 uint8_t result_bias[8]; 211 } data; 212 } activate_para_bias0; 213 214 union 215 { 216 uint64_t reg; 217 struct 218 { 219 uint8_t result_bias[8]; 220 } data; 221 } activate_para_bias1; 222 222 } kpu_activate_table_t; 223 223 224 224 typedef struct 225 225 { 226 227 228 229 230 231 232 233 234 235 226 union 227 { 228 uint64_t reg; 229 struct 230 { 231 uint64_t norm_mul : 24; 232 uint64_t norm_add : 32; 233 uint64_t norm_shift : 4; 234 } data; 235 } batchnorm; 236 236 } kpu_batchnorm_argument_t; 237 237 238 238 typedef struct 239 239 { 240 241 242 243 244 245 246 247 240 union 241 { 242 uint64_t reg; 243 struct 244 { 245 uint16_t weight[9]; 246 } data; 247 } weights; 248 248 } kpu_weights_kernel_16_3x3_t; 249 249 250 250 typedef struct 251 251 { 252 253 254 255 252 uint64_t calc_done_int : 1; 253 uint64_t layer_cfg_almost_empty_int : 1; 254 uint64_t layer_cfg_almost_full_int : 1; 255 uint64_t reserved : 61; 256 256 } kpu_config_interrupt_t; 257 257 258 258 typedef struct 259 259 { 260 261 262 260 uint64_t fifo_full_threshold : 4; 261 uint64_t fifo_empty_threshold : 4; 262 uint64_t reserved : 56; 263 263 } kpu_config_fifo_threshold_t; 264 264 265 265 typedef struct 266 266 { 267 268 269 270 271 272 267 uint64_t dma_fifo_flush_n : 1; 268 uint64_t gs_fifo_flush_n : 1; 269 uint64_t cfg_fifo_flush_n : 1; 270 uint64_t cmd_fifo_flush_n : 1; 271 uint64_t resp_fifo_flush_n : 1; 272 uint64_t reserved : 59; 273 273 } kpu_config_fifo_ctrl_t; 274 274 275 275 typedef struct 276 276 { 277 278 277 uint64_t eight_bit_mode : 1; 278 uint64_t reserved : 63; 279 279 } kpu_config_eight_bit_mode_t; 280 280 281 281 typedef struct 282 282 { 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 283 volatile uint64_t layer_argument_fifo; 284 285 volatile union 286 { 287 uint64_t reg; 288 kpu_config_interrupt_t data; 289 } interrupt_status; 290 291 volatile union 292 { 293 uint64_t reg; 294 kpu_config_interrupt_t data; 295 } interrupt_raw; 296 297 volatile union 298 { 299 uint64_t reg; 300 kpu_config_interrupt_t data; 301 } interrupt_mask; 302 303 volatile union 304 { 305 uint64_t reg; 306 kpu_config_interrupt_t data; 307 } interrupt_clear; 308 309 volatile union 310 { 311 uint64_t reg; 312 kpu_config_fifo_threshold_t data; 313 } fifo_threshold; 314 315 volatile uint64_t fifo_data_out; 316 317 volatile union 318 { 319 uint64_t reg; 320 kpu_config_fifo_ctrl_t data; 321 } fifo_ctrl; 322 323 volatile union 324 { 325 uint64_t reg; 326 kpu_config_eight_bit_mode_t data; 327 } eight_bit_mode; 328 328 } kpu_config_t; 329 329 330 #define dmac_channel_number_t int 331 332 typedef struct 333 { 334 kpu_layer_argument_t *layers; 335 kpu_layer_argument_t *remain_layers; 336 plic_irq_callback_t callback; 337 void *ctx; 338 uint64_t *src; 339 uint64_t *dst; 340 uint32_t src_length; 341 uint32_t dst_length; 342 uint32_t layers_length; 343 uint32_t remain_layers_length; 344 dmac_channel_number_t dma_ch; 345 uint32_t 
eight_bit_mode; 346 float output_scale; 347 float output_bias; 348 float input_scale; 349 float input_bias; 350 } kpu_task_t; 351 352 typedef struct 353 { 354 uint32_t version; 355 uint32_t flags; 356 uint32_t arch; 357 uint32_t layers_length; 358 uint32_t max_start_address; 359 uint32_t main_mem_usage; 360 uint32_t output_count; 330 typedef struct 331 { 332 uint32_t version; 333 uint32_t flags; 334 uint32_t arch; 335 uint32_t layers_length; 336 uint32_t max_start_address; 337 uint32_t main_mem_usage; 338 uint32_t output_count; 361 339 } kpu_kmodel_header_t; 362 340 363 341 typedef struct 364 342 { 365 366 367 368 369 343 uint32_t version; 344 uint32_t flags; 345 uint32_t layers_length; 346 uint32_t max_start_address; 347 uint32_t layers_argument_start; 370 348 } kpu_model_header_t; 371 349 372 350 typedef struct 373 351 { 374 375 352 uint32_t address; 353 uint32_t size; 376 354 } kpu_model_output_t; 377 355 378 356 typedef enum 379 357 { 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 358 KL_INVALID = 0, 359 KL_ADD, 360 KL_QUANTIZED_ADD, 361 KL_GLOBAL_MAX_POOL2D, 362 KL_QUANTIZED_GLOBAL_MAX_POOL2D, 363 KL_GLOBAL_AVERAGE_POOL2D, 364 KL_QUANTIZED_GLOBAL_AVERAGE_POOL2D, 365 KL_MAX_POOL2D, 366 KL_QUANTIZED_MAX_POOL2D, 367 KL_AVERAGE_POOL2D, 368 KL_QUANTIZED_AVERAGE_POOL2D, 369 KL_QUANTIZE, 370 KL_DEQUANTIZE, 371 KL_REQUANTIZE, 372 KL_L2_NORMALIZATION, 373 KL_SOFTMAX, 374 KL_CONCAT, 375 KL_QUANTIZED_CONCAT, 376 KL_FULLY_CONNECTED, 377 KL_QUANTIZED_FULLY_CONNECTED, 378 KL_TENSORFLOW_FLATTEN, 379 KL_QUANTIZED_TENSORFLOW_FLATTEN, 380 KL_RESIZE_NEAREST_NEIGHBOR, 381 KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR, 382 KL_CHANNELWISE_DEQUANTIZE, 383 KL_LOGISTIC, 384 KL_K210_CONV = 10240, 385 KL_K210_ADD_PADDING, 386 KL_K210_REMOVE_PADDING, 387 KL_K210_UPLOAD 410 388 } kpu_model_layer_type_t; 411 389 412 390 typedef struct 413 391 { 414 415 392 uint32_t type; 393 uint32_t body_size; 416 394 } kpu_model_layer_header_t; 417 395 418 396 typedef enum 419 397 { 420 421 398 KLF_NONE = 0, 399 KLF_MAIN_MEM_OUT = 1 422 400 } kpu_model_layer_flags_t; 423 401 424 402 typedef enum 425 403 { 426 427 404 KLP_SAME = 0, 405 KLP_VALID = 1 428 406 } kpu_model_padding_t; 429 407 430 408 typedef enum 431 409 { 432 433 434 410 KLA_LINEAR = 0, 411 KLA_RELU = 1, 412 KLA_RELU6 = 2 435 413 } kpu_model_activation_t; 436 414 437 415 typedef struct 438 416 { 439 440 417 float scale; 418 float bias; 441 419 } kpu_model_quant_param_t; 442 420 443 421 typedef struct 444 422 { 445 446 447 423 uint32_t width; 424 uint32_t height; 425 uint32_t channels; 448 426 } kpu_model_shape_t; 449 427 450 428 typedef struct 451 429 { 452 453 430 uint32_t start; 431 uint32_t size; 454 432 } kpu_model_memory_range_t; 455 433 456 434 typedef struct 457 435 { 458 459 460 461 462 463 436 uint32_t flags; 437 uint32_t main_mem_out_address; 438 uint32_t layer_offset; 439 uint32_t weights_offset; 440 uint32_t bn_offset; 441 uint32_t act_offset; 464 442 } kpu_model_conv_layer_argument_t; 465 443 466 444 typedef struct 467 445 { 468 469 470 471 472 446 uint32_t flags; 447 uint32_t main_mem_in_a_address; 448 uint32_t main_mem_in_b_address; 449 uint32_t main_mem_out_address; 450 uint32_t count; 473 451 } kpu_model_add_layer_argument_t; 474 452 475 453 typedef struct 476 454 { 477 478 479 480 481 482 483 484 485 486 487 488 489 490 455 uint32_t flags; 456 uint32_t main_mem_in_a_address; 457 uint32_t main_mem_in_b_address; 458 uint32_t main_mem_out_address; 459 uint32_t count; 460 int32_t 
in_a_offset; 461 int32_t in_a_mul; 462 int32_t in_a_shift; 463 int32_t in_b_offset; 464 int32_t in_b_mul; 465 int32_t in_b_shift; 466 int32_t out_offset; 467 int32_t out_mul; 468 int32_t out_shift; 491 469 } kpu_model_quant_add_layer_argument_t; 492 470 493 471 typedef struct 494 472 { 495 496 497 498 499 473 uint32_t flags; 474 uint32_t main_mem_in_address; 475 uint32_t main_mem_out_address; 476 uint32_t kernel_size; 477 uint32_t channels; 500 478 } kpu_model_gap2d_layer_argument_t; 501 479 502 480 typedef struct 503 481 { 504 505 506 507 508 509 510 511 512 513 514 482 uint32_t flags; 483 uint32_t main_mem_in_address; 484 uint32_t main_mem_out_address; 485 kpu_model_shape_t in_shape; 486 kpu_model_shape_t out_shape; 487 uint32_t kernel_width; 488 uint32_t kernel_height; 489 uint32_t stride_width; 490 uint32_t stride_height; 491 uint32_t padding_width; 492 uint32_t padding_height; 515 493 } kpu_model_quant_max_pool2d_layer_argument_t; 516 494 517 495 typedef struct 518 496 { 519 520 521 522 523 524 525 526 527 528 529 530 497 uint32_t flags; 498 uint32_t main_mem_in_address; 499 uint32_t main_mem_out_address; 500 kpu_model_shape_t in_shape; 501 kpu_model_shape_t out_shape; 502 uint32_t kernel_width; 503 uint32_t kernel_height; 504 uint32_t stride_width; 505 uint32_t stride_height; 506 uint32_t padding_width; 507 uint32_t padding_height; 508 kpu_model_activation_t act; 531 509 } kpu_model_ave_pool2d_layer_argument_t; 532 510 533 511 typedef struct 534 512 { 535 536 537 538 539 513 uint32_t flags; 514 uint32_t main_mem_in_address; 515 uint32_t mem_out_address; 516 uint32_t count; 517 kpu_model_quant_param_t quant_param; 540 518 } kpu_model_quantize_layer_argument_t; 541 519 542 520 typedef struct 543 521 { 544 545 546 547 548 522 uint32_t flags; 523 uint32_t main_mem_in_address; 524 uint32_t main_mem_out_address; 525 uint32_t count; 526 kpu_model_quant_param_t quant_param; 549 527 } kpu_model_dequantize_layer_argument_t; 550 528 551 529 typedef struct 552 530 { 553 554 555 556 557 531 uint32_t flags; 532 uint32_t main_mem_in_address; 533 uint32_t main_mem_out_address; 534 uint32_t count; 535 uint8_t table[256]; 558 536 } kpu_model_requantize_layer_argument_t; 559 537 560 538 typedef struct 561 539 { 562 563 564 565 540 uint32_t flags; 541 uint32_t main_mem_in_address; 542 uint32_t kpu_mem_out_address; 543 uint32_t channels; 566 544 } kpu_model_add_padding_layer_argument_t; 567 545 568 546 typedef struct 569 547 { 570 571 572 573 548 uint32_t flags; 549 uint32_t main_mem_in_address; 550 uint32_t main_mem_out_address; 551 uint32_t channels; 574 552 } kpu_model_remove_padding_layer_argument_t; 575 553 576 554 typedef struct 577 555 { 578 579 580 581 582 583 556 uint32_t flags; 557 uint32_t main_mem_in_address; 558 uint32_t kpu_mem_out_address; 559 uint32_t width; 560 uint32_t height; 561 uint32_t channels; 584 562 } kpu_model_upload_layer_argument_t; 585 563 586 564 typedef struct 587 565 { 588 589 590 591 566 uint32_t flags; 567 uint32_t main_mem_in_address; 568 uint32_t main_mem_out_address; 569 uint32_t channels; 592 570 } kpu_model_l2_norm_layer_argument_t; 593 571 594 572 typedef struct 595 573 { 596 597 598 599 574 uint32_t flags; 575 uint32_t main_mem_in_address; 576 uint32_t main_mem_out_address; 577 uint32_t channels; 600 578 } kpu_model_softmax_layer_argument_t; 601 579 602 580 typedef struct 603 581 { 604 605 606 607 582 uint32_t flags; 583 uint32_t main_mem_out_address; 584 uint32_t input_count; 585 kpu_model_memory_range_t inputs_mem[0]; 608 586 } 
kpu_model_concat_layer_argument_t; 609 587 610 588 typedef struct 611 589 { 612 613 614 615 616 617 618 590 uint32_t flags; 591 uint32_t main_mem_in_address; 592 uint32_t main_mem_out_address; 593 uint32_t in_channels; 594 uint32_t out_channels; 595 kpu_model_activation_t act; 596 float weights[0]; 619 597 } kpu_model_fully_connected_layer_argument_t; 620 598 621 599 typedef struct 622 600 { 623 624 625 626 601 uint32_t flags; 602 uint32_t main_mem_in_address; 603 uint32_t main_mem_out_address; 604 kpu_model_shape_t shape; 627 605 } kpu_model_tf_flatten_layer_argument_t; 628 606 629 607 typedef struct 630 608 { 631 632 633 634 635 636 637 609 uint32_t flags; 610 uint32_t main_mem_in_address; 611 uint32_t main_mem_out_address; 612 kpu_model_shape_t in_shape; 613 uint32_t out_width; 614 uint32_t out_height; 615 uint32_t align_corners; 638 616 } kpu_model_resize_nearest_neighbor_layer_argument_t; 639 617 640 618 typedef struct 641 619 { 642 643 644 645 646 647 648 620 uint32_t flags; 621 uint32_t main_mem_in_address; 622 uint32_t main_mem_out_address; 623 kpu_model_shape_t in_shape; 624 uint32_t out_width; 625 uint32_t out_height; 626 uint32_t align_corners; 649 627 } kpu_model_quant_resize_nearest_neighbor_layer_argument_t; 650 628 651 629 typedef struct 652 630 { 653 654 655 656 657 658 631 uint32_t flags; 632 uint32_t main_mem_in_address; 633 uint32_t main_mem_out_address; 634 uint32_t channels; 635 uint32_t channel_size; 636 kpu_model_quant_param_t quant_params[0]; 659 637 } kpu_model_channelwise_dequant_argument_t; 660 638 661 639 typedef struct 662 640 { 663 664 665 666 641 uint32_t flags; 642 uint32_t main_mem_in_address; 643 uint32_t main_mem_out_address; 644 uint32_t channels; 667 645 } kpu_model_logistic_layer_argument_t; 668 646 … 671 649 typedef struct 672 650 { 673 int is_nncase; 674 675 union 676 { 677 struct 678 { 679 const uint8_t *model_buffer; 680 uint8_t *main_buffer; 681 uint32_t output_count; 682 const kpu_model_output_t *outputs; 683 const kpu_model_layer_header_t *layer_headers; 684 const uint8_t *body_start; 685 uint32_t layers_length; 686 volatile uint32_t current_layer; 687 const uint8_t *volatile current_body; 688 dmac_channel_number_t dma_ch; 689 kpu_done_callback_t done_callback; 690 void *userdata; 691 }; 692 693 struct 694 { 695 void* nncase_ctx; 696 }; 697 }; 651 union 652 { 653 struct 654 { 655 const uint8_t *model_buffer; 656 uint8_t *main_buffer; 657 uint32_t output_count; 658 const kpu_model_output_t *outputs; 659 const kpu_model_layer_header_t *layer_headers; 660 const uint8_t *body_start; 661 uint32_t layers_length; 662 volatile uint32_t current_layer; 663 const uint8_t *volatile current_body; 664 dmac_channel_number_t dma_ch; 665 kpu_done_callback_t done_callback; 666 void *userdata; 667 }; 668 669 struct 670 { 671 void* nncase_ctx; 672 }; 673 }; 698 674 } kpu_model_context_t; 699 675 700 676 typedef struct 701 677 { 702 703 704 705 706 707 708 678 uint32_t weigths_offset; 679 uint32_t bn_offset; 680 uint32_t act_offset; 681 float input_scale; 682 float input_bias; 683 float output_scale; 684 float output_bias; 709 685 } kpu_model_layer_metadata_t; 710 686 711 687 typedef struct _quantize_param 712 688 { 713 714 689 float scale; 690 float bias; 715 691 } quantize_param_t; 716 692 717 693 extern volatile kpu_config_t *const kpu; 718 694 719 /** 720 * @brief Model compiler init kpu handler 721 * 722 * @param[in] task Kpu handler 723 * 724 * @return Kpu handler 725 */ 726 extern kpu_task_t *kpu_task_init(kpu_task_t *task); 727 728 /** 729 * @brief Kpu run 
for AI 730 * 731 * @param[in] task Kpu handler 732 * @param[in] dma_ch DMA for kpu 733 * @param[in] src The picture data 734 * @param[in] dest The result of kpu 735 * @param[in] callback The callback of kpu 736 * 737 * @return result 738 * - 0 Success 739 * - Other Fail. Kpu is busy. 740 */ 741 int kpu_run(kpu_task_t *task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback); 742 743 /** 744 * @brief Get kpu result buf 745 * 746 * @param[in] task Kpu handler 747 * 748 * @return Kpu result buf 749 */ 750 uint8_t *kpu_get_output_buf(kpu_task_t *task); 751 752 /** 753 * @brief Release kpu output buf 754 * 755 * @param[in] output_buf Kpu output buf 756 * 757 */ 758 void kpu_release_output_buf(uint8_t *output_buf); 759 760 /** 761 * @brief Kpu run for AI 762 * 763 * @param[in] task Kpu handler 764 * 765 * @return result 766 * - 0 Success 767 * - Other Fail. Kpu is busy. 768 */ 769 int kpu_start(kpu_task_t *task); 770 771 /** 772 * @brief Initialize kpu handler 773 * 774 * @param[in] task Kpu handler 775 * 776 * @return result 777 * - 0 Success 778 * - Other Fail. 779 */ 780 int kpu_single_task_init(kpu_task_t *task); 781 782 /** 783 * @brief Uninitialize kpu handler 784 * 785 * @param[in] task Kpu handler 786 * 787 * @return result 788 * - 0 Success 789 * - Other Fail. 790 */ 791 int kpu_single_task_deinit(kpu_task_t *task); 792 793 /** 794 * @brief Load kmodel and init kpu task 795 * 796 * @param[in] task Kpu handler 797 * @param[in] buffer Kmodel 798 * @param[in] meta Test data 799 * 800 * @return result 801 * - 0 Success 802 * - Other Fail. 803 */ 804 int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta); 805 806 /** 807 * @brief Kpu initialize 808 * 809 * @param[in] eight_bit_mode 0: 16-bit mode, 1: 8-bit mode 810 * @param[in] callback Callback of kpu 811 * @param[in] userdata Data of callback 812 * 813 */ 814 void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata); 815 816 /** 817 * @brief Kpu input data by dma 818 * 819 * @param[in] layer Kpu task layer 820 * @param[in] src Image data 821 * @param[in] dma_ch Dmac channel 822 * @param[in] callback Dmac complete callback 823 * @param[in] userdata Data of callback 824 * 825 */ 826 void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata); 827 828 /** 829 * @brief Kpu input data by cpu 830 * 831 * @param[in] layer Kpu task layer 832 * @param[in] src Image data 833 * @param[in] width Image width 834 * @param[in] height Image height 835 * @param[in] channels Color channel, RGB is 3 836 * 837 */ 838 void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels); 839 840 /** 841 * @brief Kpu run only one layer 842 * 843 * @param[in] layer Kpu task layer 844 * 845 */ 846 void kpu_conv2d(kpu_layer_argument_t *layer); 847 848 /** 849 * @brief Kpu run only one layer then get the result by dma 850 * 851 * @param[in] layer Kpu task layer 852 * @param[in] dma_ch Dmac channel 853 * @param[in] dest Result 854 * @param[in] callback Dmac complete callback 855 * @param[in] userdata Data of callback 856 * 857 */ 858 void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata); 859 860 /** 861 * @brief Kpu pooling 862 * 863 * @param[in] src Source 864 * @param[in] src_param Source param 865 * @param[in] kernel_size Kernel size, 7*7 is 49 
866 * @param[in] channels Channels 867 * @param[in] dest Dest 868 * @param[in] dest_param Dest param 869 * 870 */ 871 void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param); 872 873 /** 874 * @brief Kpu pooling 875 * 876 * @param[in] src Source 877 * @param[in] src_param Source param 878 * @param[in] kernel_size Kernel size, 7*7 is 49 879 * @param[in] channels Channels 880 * @param[in] dest Dest 881 * 882 */ 883 void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest); 884 885 /** 886 * @brief Kpu fully connected by cpu 887 * 888 * @param[in] src Source 889 * @param[in] weights Weight 890 * @param[in] biases Biases 891 * @param[in] dest Dest 892 * @param[in] input_channels Input channels 893 * @param[in] output_channels Output channels 894 * 895 */ 896 void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels); 897 898 /** 899 * @brief Kpu matrix multiplication 900 * 901 * @param[in] src Source 902 * @param[in] channels Channels 903 * @param[in] dest Dest 904 * @param[in] dest_param Dest param 905 * 906 */ 907 void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param); 908 909 /** 910 * @brief Kpu dequantize 911 * 912 * @param[in] src Source 913 * @param[in] src_param Source param 914 * @param[in] count Dequantize count 915 * @param[in] dest Dest 916 * 917 */ 918 void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest); 919 920 /** 921 * @brief Kpu load kmodel 922 * 923 * @param[in] ctx Kmodel object 924 * @param[in] buffer Kmodel buffer 925 * 926 * @return result 927 * - 0 Success 928 * - Other Fail. 929 */ 695 ER kpu_init(kpu_model_context_t *ctx); 930 696 int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer); 931 932 /** 933 * @brief Kpu free kmodel buffer 934 * 935 * @param[in] ctx kmodel object 936 * 937 */ 938 void kpu_model_free(kpu_model_context_t *ctx); 939 940 /** 941 * @brief Kpu get output 942 * 943 * @param[in] ctx Kmodel object 944 * @param[in] index Output index 945 * @param[in] data Output data 946 * @param[in] size Output data size 947 * 948 * @return result 949 * - 0 Success 950 * - Other Fail. 951 */ 952 697 int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size); 953 954 /** 955 * @brief Kpu run kmodel 956 * 957 * @param[in] ctx Kmodel object 958 * @param[in] src Source data 959 * @param[in] dma_ch Dma channel 960 * @param[in] done_callback Kpu complete callback 961 * @param[in] userdata Data of callback 962 * 963 * @return result 964 * - 0 Success 965 * - Other Fail. 966 */ 967 698 int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata); 699 ER kpu_wait_done(kpu_model_context_t *ctx, TMO tmout); 968 700 969 701 #ifdef __cplusplus -
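Several of the argument structs above end in a zero-length array (inputs_mem[0], weights[0], quant_params[0]): the pre-C99 spelling of a flexible array member whose element count comes from an adjacent field, which is how a single kpu_model_layer_header_t body can carry a variable-sized payload. A sketch of how such a tail is consumed, modeled on kpu_concat from kpu.c and using the standard C99 [] spelling with illustrative type names:

#include <stdint.h>
#include <string.h>

typedef struct { uint32_t start, size; } mem_range; /* cf. kpu_model_memory_range_t */
typedef struct {
    uint32_t flags;
    uint32_t main_mem_out_address;
    uint32_t input_count;
    mem_range inputs_mem[]; /* flexible array member, input_count entries */
} concat_arg;

/* Copy input_count regions of main_buffer back to back at the output
 * address, the way kpu_concat walks arg->inputs_mem[]. */
static void concat_regions(const concat_arg *arg, uint8_t *main_buffer)
{
    uint8_t *dest = main_buffer + arg->main_mem_out_address;
    for (uint32_t i = 0; i < arg->input_count; i++)
    {
        const mem_range r = arg->inputs_mem[i];
        memcpy(dest, main_buffer + r.start, r.size);
        dest += r.size;
    }
}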
azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/region_layer.c
r453 r458 6 6 typedef struct 7 7 { 8 9 10 11 8 float x; 9 float y; 10 float w; 11 float h; 12 12 } box_t; 13 13 14 14 typedef struct 15 15 { 16 17 18 16 int index; 17 int class; 18 float **probs; 19 19 } sortable_box_t; 20 20 … … 22 22 int region_layer_init(region_layer_t *rl, int width, int height, int channels, int origin_width, int origin_height) 23 23 { 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 24 int flag = 0; 25 26 rl->coords = 4; 27 rl->image_width = 320; 28 rl->image_height = 240; 29 30 rl->classes = channels / 5 - 5; 31 rl->net_width = origin_width; 32 rl->net_height = origin_height; 33 rl->layer_width = width; 34 rl->layer_height = height; 35 rl->boxes_number = (rl->layer_width * rl->layer_height * rl->anchor_number); 36 rl->output_number = (rl->boxes_number * (rl->classes + rl->coords + 1)); 37 38 rl->output = malloc(rl->output_number * sizeof(float)); 39 if (rl->output == NULL) 40 { 41 flag = -1; 42 goto malloc_error; 43 } 44 rl->boxes = malloc(rl->boxes_number * sizeof(box_t)); 45 if (rl->boxes == NULL) 46 { 47 flag = -2; 48 goto malloc_error; 49 } 50 rl->probs_buf = malloc(rl->boxes_number * (rl->classes + 1) * sizeof(float)); 51 if (rl->probs_buf == NULL) 52 { 53 flag = -3; 54 goto malloc_error; 55 } 56 rl->probs = malloc(rl->boxes_number * sizeof(float *)); 57 if (rl->probs == NULL) 58 { 59 flag = -4; 60 goto malloc_error; 61 } 62 for (uint32_t i = 0; i < rl->boxes_number; i++) 63 rl->probs[i] = &(rl->probs_buf[i * (rl->classes + 1)]); 64 return 0; 65 65 malloc_error: 66 67 68 69 70 66 free(rl->output); 67 free(rl->boxes); 68 free(rl->probs_buf); 69 free(rl->probs); 70 return flag; 71 71 } 72 72 73 73 void region_layer_deinit(region_layer_t *rl) 74 74 { 75 76 77 78 75 free(rl->output); 76 free(rl->boxes); 77 free(rl->probs_buf); 78 free(rl->probs); 79 79 } 80 80 81 81 static inline float sigmoid(float x) 82 82 { 83 83 return 1.f / (1.f + expf(-x)); 84 84 } 85 85 86 86 static void activate_array(region_layer_t *rl, int index, int n) 87 87 { 88 89 90 91 92 88 float *output = &rl->output[index]; 89 float *input = &rl->input[index]; 90 91 for (int i = 0; i < n; ++i) 92 output[i] = sigmoid(input[i]); 93 93 } 94 94 95 95 static int entry_index(region_layer_t *rl, int location, int entry) 96 96 { 97 98 99 100 101 97 int wh = rl->layer_width * rl->layer_height; 98 int n = location / wh; 99 int loc = location % wh; 100 101 return n * wh * (rl->coords + rl->classes + 1) + entry * wh + loc; 102 102 } 103 103 104 104 static void softmax(region_layer_t *rl, float *input, int n, int stride, float *output) 105 105 { 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 106 int i; 107 float diff; 108 float e; 109 float sum = 0; 110 float largest_i = input[0]; 111 112 for (i = 0; i < n; ++i) 113 { 114 if (input[i * stride] > largest_i) 115 largest_i = input[i * stride]; 116 } 117 118 for (i = 0; i < n; ++i) { 119 diff = input[i * stride] - largest_i; 120 e = expf(diff); 121 sum += e; 122 output[i * stride] = e; 123 } 124 for (i = 0; i < n; ++i) 125 output[i * stride] /= sum; 126 126 } 127 127 128 128 static void softmax_cpu(region_layer_t *rl, float *input, int n, int batch, int batch_offset, int groups, int stride, float *output) 129 129 { 130 131 132 133 134 135 130 int g, b; 131 132 for (b = 0; b < batch; ++b) { 133 for (g = 0; g < groups; ++g) 134 softmax(rl, input + b * batch_offset + g, n, stride, output + b * batch_offset + g); 135 } 136 136 } 137 137 138 138 static void 
forward_region_layer(region_layer_t *rl) 139 139 { 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 140 int index; 141 142 for (index = 0; index < rl->output_number; index++) 143 rl->output[index] = rl->input[index]; 144 145 for (int n = 0; n < rl->anchor_number; ++n) 146 { 147 index = entry_index(rl, n * rl->layer_width * rl->layer_height, 0); 148 activate_array(rl, index, 2 * rl->layer_width * rl->layer_height); 149 index = entry_index(rl, n * rl->layer_width * rl->layer_height, 4); 150 activate_array(rl, index, rl->layer_width * rl->layer_height); 151 } 152 153 index = entry_index(rl, 0, rl->coords + 1); 154 softmax_cpu(rl, rl->input + index, rl->classes, rl->anchor_number, 155 rl->output_number / rl->anchor_number, rl->layer_width * rl->layer_height, 156 rl->layer_width * rl->layer_height, rl->output + index); 157 157 } 158 158 159 159 static void correct_region_boxes(region_layer_t *rl, box_t *boxes) 160 160 { 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 161 uint32_t net_width = rl->net_width; 162 uint32_t net_height = rl->net_height; 163 uint32_t image_width = rl->image_width; 164 uint32_t image_height = rl->image_height; 165 uint32_t boxes_number = rl->boxes_number; 166 int new_w = 0; 167 int new_h = 0; 168 169 if (((float)net_width / image_width) < 170 ((float)net_height / image_height)) { 171 new_w = net_width; 172 new_h = (image_height * net_width) / image_width; 173 } else { 174 new_h = net_height; 175 new_w = (image_width * net_height) / image_height; 176 } 177 for (int i = 0; i < boxes_number; ++i) { 178 box_t b = boxes[i]; 179 180 b.x = (b.x - (net_width - new_w) / 2. / net_width) / 181 ((float)new_w / net_width); 182 b.y = (b.y - (net_height - new_h) / 2. 
/ net_height) / 183 ((float)new_h / net_height); 184 b.w *= (float)net_width / new_w; 185 b.h *= (float)net_height / new_h; 186 boxes[i] = b; 187 } 188 188 } 189 189 190 190 static box_t get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h, int stride) 191 191 { 192 193 194 195 196 197 198 192 volatile box_t b; 193 194 b.x = (i + x[index + 0 * stride]) / w; 195 b.y = (j + x[index + 1 * stride]) / h; 196 b.w = expf(x[index + 2 * stride]) * biases[2 * n] / w; 197 b.h = expf(x[index + 3 * stride]) * biases[2 * n + 1] / h; 198 return b; 199 199 } 200 200 201 201 static void get_region_boxes(region_layer_t *rl, float *predictions, float **probs, box_t *boxes) 202 202 { 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 203 uint32_t layer_width = rl->layer_width; 204 uint32_t layer_height = rl->layer_height; 205 uint32_t anchor_number = rl->anchor_number; 206 uint32_t classes = rl->classes; 207 uint32_t coords = rl->coords; 208 float threshold = rl->threshold; 209 210 for (int i = 0; i < layer_width * layer_height; ++i) 211 { 212 int row = i / layer_width; 213 int col = i % layer_width; 214 215 for (int n = 0; n < anchor_number; ++n) 216 { 217 int index = n * layer_width * layer_height + i; 218 219 for (int j = 0; j < classes; ++j) 220 probs[index][j] = 0; 221 int obj_index = entry_index(rl, n * layer_width * layer_height + i, coords); 222 int box_index = entry_index(rl, n * layer_width * layer_height + i, 0); 223 float scale = predictions[obj_index]; 224 225 boxes[index] = get_region_box(predictions, rl->anchor, n, box_index, col, row, 226 layer_width, layer_height, layer_width * layer_height); 227 228 float max = 0; 229 230 for (int j = 0; j < classes; ++j) 231 { 232 int class_index = entry_index(rl, n * layer_width * layer_height + i, coords + 1 + j); 233 float prob = scale * predictions[class_index]; 234 235 probs[index][j] = (prob > threshold) ? prob : 0; 236 if (prob > max) 237 max = prob; 238 } 239 probs[index][classes] = max; 240 } 241 } 242 correct_region_boxes(rl, boxes); 243 243 } 244 244 245 245 static int nms_comparator(void *pa, void *pb) 246 246 { 247 248 249 250 251 252 253 254 255 247 sortable_box_t a = *(sortable_box_t *)pa; 248 sortable_box_t b = *(sortable_box_t *)pb; 249 float diff = a.probs[a.index][b.class] - b.probs[b.index][b.class]; 250 251 if (diff < 0) 252 return 1; 253 else if (diff > 0) 254 return -1; 255 return 0; 256 256 } 257 257 258 258 static float overlap(float x1, float w1, float x2, float w2) 259 259 { 260 261 262 263 264 265 266 267 260 float l1 = x1 - w1/2; 261 float l2 = x2 - w2/2; 262 float left = l1 > l2 ? l1 : l2; 263 float r1 = x1 + w1/2; 264 float r2 = x2 + w2/2; 265 float right = r1 < r2 ? 
r1 : r2; 266 267 return right - left; 268 268 } 269 269 270 270 static float box_intersection(box_t a, box_t b) 271 271 { 272 273 274 275 276 277 272 float w = overlap(a.x, a.w, b.x, b.w); 273 float h = overlap(a.y, a.h, b.y, b.h); 274 275 if (w < 0 || h < 0) 276 return 0; 277 return w * h; 278 278 } 279 279 280 280 static float box_union(box_t a, box_t b) 281 281 { 282 283 284 285 282 float i = box_intersection(a, b); 283 float u = a.w * a.h + b.w * b.h - i; 284 285 return u; 286 286 } 287 287 288 288 static float box_iou(box_t a, box_t b) 289 289 { 290 290 return box_intersection(a, b) / box_union(a, b); 291 291 } 292 292 293 293 static void do_nms_sort(region_layer_t *rl, box_t *boxes, float **probs) 294 294 { 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 295 uint32_t boxes_number = rl->boxes_number; 296 uint32_t classes = rl->classes; 297 float nms_value = rl->nms_value; 298 int i, j, k; 299 sortable_box_t s[boxes_number]; 300 301 for (i = 0; i < boxes_number; ++i) 302 { 303 s[i].index = i; 304 s[i].class = 0; 305 s[i].probs = probs; 306 } 307 308 for (k = 0; k < classes; ++k) 309 { 310 for (i = 0; i < boxes_number; ++i) 311 s[i].class = k; 312 qsort(s, boxes_number, sizeof(sortable_box_t), nms_comparator); 313 for (i = 0; i < boxes_number; ++i) 314 { 315 if (probs[s[i].index][k] == 0) 316 continue; 317 box_t a = boxes[s[i].index]; 318 319 for (j = i + 1; j < boxes_number; ++j) 320 { 321 box_t b = boxes[s[j].index]; 322 323 if (box_iou(a, b) > nms_value) 324 probs[s[j].index][k] = 0; 325 } 326 } 327 } 328 328 } 329 329 330 330 static int max_index(float *a, int n) 331 331 { 332 333 334 335 336 337 338 339 340 341 342 343 332 int i, max_i = 0; 333 float max = a[0]; 334 335 for (i = 1; i < n; ++i) 336 { 337 if (a[i] > max) 338 { 339 max = a[i]; 340 max_i = i; 341 } 342 } 343 return max_i; 344 344 } 345 345 346 346 static void region_layer_output(region_layer_t *rl, obj_info_t *obj_info) 347 347 { 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 348 uint32_t obj_number = 0; 349 uint32_t image_width = rl->image_width; 350 uint32_t image_height = rl->image_height; 351 uint32_t boxes_number = rl->boxes_number; 352 float threshold = rl->threshold; 353 box_t *boxes = (box_t *)rl->boxes; 354 355 for (int i = 0; i < rl->boxes_number; ++i) 356 { 357 int class = max_index(rl->probs[i], rl->classes); 358 float prob = rl->probs[i][class]; 359 360 if (prob > threshold) 361 { 362 box_t *b = boxes + i; 363 obj_info->obj[obj_number].x1 = b->x * image_width - (b->w * image_width / 2); 364 obj_info->obj[obj_number].y1 = b->y * image_height - (b->h * image_height / 2); 365 obj_info->obj[obj_number].x2 = b->x * image_width + (b->w * image_width / 2); 366 obj_info->obj[obj_number].y2 = b->y * image_height + (b->h * image_height / 2); 367 obj_info->obj[obj_number].class_id = class; 368 obj_info->obj[obj_number].prob = prob; 369 obj_number++; 370 } 371 } 372 obj_info->obj_number = obj_number; 373 373 } 374 374 375 375 void region_layer_run(region_layer_t *rl, obj_info_t *obj_info) 376 376 { 377 378 379 380 377 forward_region_layer(rl); 378 get_region_boxes(rl, rl->output, rl->probs, rl->boxes); 379 do_nms_sort(rl, rl->boxes, rl->probs); 380 // region_layer_output(rl, obj_info); 381 381 } 382 382 383 383 void region_layer_draw_boxes(region_layer_t *rl, callback_draw_box callback) 384 384 { 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 } 
385 uint32_t image_width = rl->image_width; 386 uint32_t image_height = rl->image_height; 387 float threshold = rl->threshold; 388 box_t *boxes = (box_t *)rl->boxes; 389 390 for (int i = 0; i < rl->boxes_number; ++i) 391 { 392 int class = max_index(rl->probs[i], rl->classes); 393 float prob = rl->probs[i][class]; 394 395 if (prob > threshold) 396 { 397 box_t *b = boxes + i; 398 uint32_t x1 = b->x * image_width - (b->w * image_width / 2); 399 uint32_t y1 = b->y * image_height - (b->h * image_height / 2); 400 uint32_t x2 = b->x * image_width + (b->w * image_width / 2); 401 uint32_t y2 = b->y * image_height + (b->h * image_height / 2); 402 callback(x1, y1, x2, y2, class, prob); 403 } 404 } 405 } -
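As a usage note for region_layer.c, the sketch below shows how the layer might be wired to a YOLO-style detector. The anchors, thresholds, grid geometry, and header name are illustrative assumptions, not values from this changeset; anchor_number must be assigned before region_layer_init() because boxes_number is derived from it.

    /* Hypothetical detector wiring for the region layer above. */
    #include "region_layer.h"            /* header name assumed */

    static float anchors[] = {           /* 5 (w, h) anchor pairs, assumed values */
        1.08f, 1.19f, 3.42f, 4.41f, 6.63f, 11.38f, 9.42f, 5.11f, 16.62f, 10.52f
    };
    static region_layer_t rl;
    static obj_info_t info;

    /* Assumed to match callback_draw_box: box corners, class id, probability. */
    static void draw_box(uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2,
                         int class, float prob)
    {
        /* render the rectangle and label here */
    }

    int detector_init(float *kpu_feature_map)
    {
        rl.anchor_number = 5;            /* set before init: boxes_number uses it */
        rl.anchor = anchors;
        rl.threshold = 0.5f;             /* objectness cut-off */
        rl.nms_value = 0.3f;             /* IoU threshold for NMS */
        /* 10x8 grid, 5 * (5 + 20) = 125 channels -> 20 classes, 320x240 net input */
        if (region_layer_init(&rl, 10, 8, 125, 320, 240) != 0)
            return -1;
        rl.input = kpu_feature_map;      /* float feature map produced by the KPU */
        return 0;
    }

    void detector_step(void)
    {
        region_layer_run(&rl, &info);    /* region_layer_output() is commented out in
                                            this revision, so info stays empty */
        region_layer_draw_boxes(&rl, draw_box);
    }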
azure_iot_hub_riscv/trunk/app_iothub_client/src/command.c
r453 r458 92 92 void 93 93 digitalWrite(uint8_t Pin, int dwVal){ 94 94 int8_t gpio_pin = gpio_get_gpiohno(Pin, false); 95 95 96 97 98 96 if( gpio_pin >= 0){ 97 gpio_set_pin(TADR_GPIOHS_BASE, (uint8_t)gpio_pin, dwVal); 98 } 99 99 } 100 100 -
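The wrapper above maps an Arduino-style pin number to a GPIOHS line and drives it. A minimal caller might look like the sketch below; pin 13 is an illustrative choice that must have a GPIOHS mapping in the board setup.

    /* Hypothetical caller of digitalWrite(); the pin number is an assumption. */
    static void pulse_pin(void)
    {
        digitalWrite(13, 1);   /* drive high; silently ignored without a GPIOHS mapping */
        /* ... short delay ... */
        digitalWrite(13, 0);   /* drive low */
    }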
azure_iot_hub_riscv/trunk/app_iothub_client/src/envcmd.c
r453 r458 1 1 /* 2 * TOPPERS/ASP Kernel 3 * Toyohashi Open Platform for Embedded Real-Time Systems/ 4 * Advanced Standard Profile Kernel 5 * 6 * Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory 7 * Toyohashi Univ. of Technology, JAPAN 8 * Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory 9 * Graduate School of Information Science, Nagoya Univ., JAPAN 10 * 11-43 * [TOPPERS license terms stored in a legacy encoding that no longer displays; the content is identical to the re-encoded header added below] 44 * $Id$ 45 */
2 * TOPPERS/ASP Kernel 3 * Toyohashi Open Platform for Embedded Real-Time Systems/ 4 * Advanced Standard Profile Kernel 5 * 6 * Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory 7 * Toyohashi Univ. of Technology, JAPAN 8 * Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory 9 * Graduate School of Information Science, Nagoya Univ., JAPAN 10 * 11-39 * [TOPPERS license, translated from Japanese:] The above copyright holders grant permission gratis to use, copy, modify, and redistribute this software (including modified copies; hereafter "use") provided that conditions (1) through (4) below are met. (1) When this software is used in source-code form, the above copyright notice, these terms of use, and the disclaimer below must be retained unmodified in the source code. (2) When this software is redistributed in a form usable for developing other software, such as a library, the above copyright notice, these terms of use, and the disclaimer below must be reproduced in the documentation (user manuals, etc.) accompanying the redistribution. (3) When this software is redistributed in a form unusable for developing other software, such as embedded in equipment, one of the following must be satisfied: (a) the above copyright notice, these terms of use, and the disclaimer below are reproduced in the accompanying documentation, or (b) the form of redistribution is reported to the TOPPERS Project by a separately specified method. (4) The above copyright holders and the TOPPERS Project are exempted from any damage arising directly or indirectly from the use of this software, and are indemnified against any claims from users or end users of this software on any grounds. This software is provided without any warranty: the above copyright holders and the TOPPERS Project make no warranty of any kind regarding this software, including its fitness for any particular purpose, and accept no liability for any damage arising directly or indirectly from its use. 40 * $Id$ 41 */
46 42 #include <stdio.h> 47 43 #include <stdlib.h> … … 222 218 } 223 219 224 225 220 int set_cs_main(int argc, char **argv) 226 221 { … … 285 280 return 0; 286 281 } 282 287 283 int clear_proxy_main(int argc, char **argv) 288 284 { -
azure_iot_hub_riscv/trunk/app_iothub_client/src/esp_at_socket.c
r453 r458 1 1 /* 2 2 * TOPPERS/ASP Kernel 3 3 * Toyohashi Open Platform for Embedded Real-Time Systems/ 4 4 * Advanced Standard Profile Kernel 5 5 * 6 6 * Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory 7 7 * Toyohashi Univ. of Technology, JAPAN 8 8 * Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory 9 9 * Graduate School of Information Science, Nagoya Univ., JAPAN 10 10 * 11-39 * [TOPPERS license terms, identical to the translated header in envcmd.c above; this hunk only re-indents the comment block] 40 40 * $Id$ 41 41 */
42 42 #include <stddef.h> 43 43 #include <stdbool.h> … … 306 306 { 307 307 struct tm tm = { 308 308 0, /* tm_sec */ 309 309 0, /* tm_min */ 310 310 0, /* tm_hour */ 311 311 1, /* tm_mday */ 312 312 0, /* tm_mon */ 313 313 2020 - 1900, /* tm_year */ 314 314 }; 315 315 MINIMUM_YEAR = mktime(&tm); -
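The struct tm above builds a 2020-01-01 baseline via mktime(), a natural lower bound for a network-synchronized clock. A sketch of such a check follows; the helper name and the extern declaration are hypothetical.

    /* Hypothetical plausibility check against the MINIMUM_YEAR baseline above. */
    #include <time.h>

    extern time_t MINIMUM_YEAR;          /* type assumed; set from mktime() as above */

    static int clock_is_plausible(time_t now)
    {
        /* an RTC that was never set typically reports a pre-2020 date */
        return now >= MINIMUM_YEAR;
    }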
azure_iot_hub_riscv/trunk/app_iothub_client/src/esp_at_socket.h
r453 r458 1 1 /* 2 2 * TOPPERS/ASP Kernel 3 3 * Toyohashi Open Platform for Embedded Real-Time Systems/ 4 4 * Advanced Standard Profile Kernel 5 5 * 6 6 * Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory 7 7 * Toyohashi Univ. of Technology, JAPAN 8 8 * Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory 9 9 * Graduate School of Information Science, Nagoya Univ., JAPAN 10 10 * 11-39 * [TOPPERS license terms, identical to the translated header in envcmd.c above; this hunk only re-indents the comment block] 40 40 * $Id$ 41 41 */
42 42 43 43 #ifndef _ESP_AT_SOCKET_H_ -
azure_iot_hub_riscv/trunk/app_iothub_client/src/kpu_main.c
r453