Changeset 458 for azure_iot_hub_riscv/trunk/app_iothub_client

azure_iot_hub_riscv/trunk/app_iothub_client/.vscode/tasks.json

r453	r458
149	149	"label": "write app_iothub_client",
150	150	"command": "/C/Python38/python.exe",
151		"args": [
	151	"args": [
152	152	"../tools/kflash/kflash.py",
153	153	"-p",

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/atomic.h

-              r453
+              r458
 #define SPINLOCK_INIT \
     {                 \
              \
+    }
+        {                 \
+             \
+        }
 #define CORELOCK_INIT          \
     {                          \
         .lock = SPINLOCK_INIT, \
         .count = 0,            \
         .core = -1             \
+    }
+        {                          \
+                .lock = SPINLOCK_INIT, \
+                .count = 0,            \
+                .core = -1             \
+        }
 /* Definition of memory barrier macro */
 #define mb()                          \
     {                                 \
         asm volatile("fence" ::       \
                          : "memory"); \
+    }
+        {                                 \
+                asm volatile("fence" ::       \
+                                                 : "memory"); \
+        }
 #define atomic_set(ptr, val) (*(volatile typeof(*(ptr)) *)(ptr) = val)
 …
 typedef struct _spinlock
+{
     int lock;
+        int lock;
 } spinlock_t;
 typedef struct _semaphore
+{
     spinlock_t lock;
     int count;
     int waiting;
+        spinlock_t lock;
+        int count;
+        int waiting;
 } semaphore_t;
 typedef struct _corelock
+{
     spinlock_t lock;
     int count;
     int core;
+        spinlock_t lock;
+        int count;
+        int core;
 } corelock_t;
 static inline int spinlock_trylock(spinlock_t *lock)
+{
     int res = atomic_swap(&lock->lock, -1);
     /* Use memory barrier to keep coherency */
     mb();
     return res;
+        int res = atomic_swap(&lock->lock, -1);
+        /* Use memory barrier to keep coherency */
+        mb();
+        return res;
+}
 static inline void spinlock_lock(spinlock_t *lock)
+{
     while(spinlock_trylock(lock))
+        ;
+        while(spinlock_trylock(lock))
+                ;
+}
 static inline void spinlock_unlock(spinlock_t *lock)
+{
     /* Use memory barrier to keep coherency */
     mb();
     atomic_set(&lock->lock, 0);
     asm volatile("nop");
+        /* Use memory barrier to keep coherency */
+        mb();
+        atomic_set(&lock->lock, 0);
+        asm volatile("nop");
+}
 static inline void semaphore_signal(semaphore_t *semaphore, int i)
+{
     spinlock_lock(&(semaphore->lock));
     semaphore->count += i;
     spinlock_unlock(&(semaphore->lock));
+        spinlock_lock(&(semaphore->lock));
+        semaphore->count += i;
+        spinlock_unlock(&(semaphore->lock));
+}
 static inline void semaphore_wait(semaphore_t *semaphore, int i)
+{
     atomic_add(&(semaphore->waiting), 1);
     while(1)
+    {
         spinlock_lock(&(semaphore->lock));
         if(semaphore->count >= i)
+        {
             semaphore->count -= i;
             atomic_add(&(semaphore->waiting), -1);
             spinlock_unlock(&(semaphore->lock));
             break;
+        }
         spinlock_unlock(&(semaphore->lock));
+    }
+        atomic_add(&(semaphore->waiting), 1);
+        while(1)
+        {
+                spinlock_lock(&(semaphore->lock));
+                if(semaphore->count >= i)
+                {
+                        semaphore->count -= i;
+                        atomic_add(&(semaphore->waiting), -1);
+                        spinlock_unlock(&(semaphore->lock));
+                        break;
+                }
+                spinlock_unlock(&(semaphore->lock));
+        }
+}
 static inline int semaphore_count(semaphore_t *semaphore)
+{
     int res = 0;
     spinlock_lock(&(semaphore->lock));
     res = semaphore->count;
     spinlock_unlock(&(semaphore->lock));
     return res;
+        int res = 0;
+        spinlock_lock(&(semaphore->lock));
+        res = semaphore->count;
+        spinlock_unlock(&(semaphore->lock));
+        return res;
+}
 static inline int semaphore_waiting(semaphore_t *semaphore)
+{
     return atomic_read(&(semaphore->waiting));
+        return atomic_read(&(semaphore->waiting));
+}
 static inline int corelock_trylock(corelock_t *lock)
+{
     int res = 0;
     unsigned long core;
     asm volatile("csrr %0, mhartid;"
                  : "=r"(core));
     if(spinlock_trylock(&lock->lock))
+    {
         return -1;
+    }
     if(lock->count == 0)
+    {
         /* First time get lock */
         lock->count++;
         lock->core = core;
         res = 0;
     } else if(lock->core == core)
+    {
         /* Same core get lock */
         lock->count++;
         res = 0;
     } else
+    {
         /* Different core get lock */
         res = -1;
+    }
     spinlock_unlock(&lock->lock);
     return res;
+        int res = 0;
+        unsigned long core;
+        asm volatile("csrr %0, mhartid;"
+                                 : "=r"(core));
+        if(spinlock_trylock(&lock->lock))
+        {
+                return -1;
+        }
+        if(lock->count == 0)
+        {
+                /* First time get lock */
+                lock->count++;
+                lock->core = core;
+                res = 0;
+        } else if(lock->core == core)
+        {
+                /* Same core get lock */
+                lock->count++;
+                res = 0;
+        } else
+        {
+                /* Different core get lock */
+                res = -1;
+        }
+        spinlock_unlock(&lock->lock);
+        return res;
+}
 static inline void corelock_lock(corelock_t *lock)
+{
     unsigned long core;
     asm volatile("csrr %0, mhartid;"
                  : "=r"(core));
     spinlock_lock(&lock->lock);
     if(lock->count == 0)
+    {
         /* First time get lock */
         lock->count++;
         lock->core = core;
     } else if(lock->core == core)
+    {
         /* Same core get lock */
         lock->count++;
     } else
+    {
         /* Different core get lock */
         spinlock_unlock(&lock->lock);
         do
+        {
             while(atomic_read(&lock->count))
+                ;
         } while(corelock_trylock(lock));
         return;
+    }
     spinlock_unlock(&lock->lock);
+        unsigned long core;
+        asm volatile("csrr %0, mhartid;"
+                                 : "=r"(core));
+        spinlock_lock(&lock->lock);
+        if(lock->count == 0)
+        {
+                /* First time get lock */
+                lock->count++;
+                lock->core = core;
+        } else if(lock->core == core)
+        {
+                /* Same core get lock */
+                lock->count++;
+        } else
+        {
+                /* Different core get lock */
+                spinlock_unlock(&lock->lock);
+                do
+                {
+                        while(atomic_read(&lock->count))
+                                ;
+                } while(corelock_trylock(lock));
+                return;
+        }
+        spinlock_unlock(&lock->lock);
+}
 static inline void corelock_unlock(corelock_t *lock)
+{
     unsigned long core;
     asm volatile("csrr %0, mhartid;"
                  : "=r"(core));
     spinlock_lock(&lock->lock);
     if(lock->core == core)
+    {
         /* Same core release lock */
         lock->count--;
         if(lock->count <= 0)
+        {
             lock->core = -1;
             lock->count = 0;
+        }
     } else
+    {
         /* Different core release lock */
         spinlock_unlock(&lock->lock);
         register unsigned long a7 asm("a7") = 93;
         register unsigned long a0 asm("a0") = 0;
         register unsigned long a1 asm("a1") = 0;
         register unsigned long a2 asm("a2") = 0;
         asm volatile("scall"
                      : "+r"(a0)
                      : "r"(a1), "r"(a2), "r"(a7));
+    }
     spinlock_unlock(&lock->lock);
+        unsigned long core;
+        asm volatile("csrr %0, mhartid;"
+                                 : "=r"(core));
+        spinlock_lock(&lock->lock);
+        if(lock->core == core)
+        {
+                /* Same core release lock */
+                lock->count--;
+                if(lock->count <= 0)
+                {
+                        lock->core = -1;
+                        lock->count = 0;
+                }
+        } else
+        {
+                /* Different core release lock */
+                spinlock_unlock(&lock->lock);
+                register unsigned long a7 asm("a7") = 93;
+                register unsigned long a0 asm("a0") = 0;
+                register unsigned long a1 asm("a1") = 0;
+                register unsigned long a2 asm("a2") = 0;
+                asm volatile("scall"
+                                         : "+r"(a0)
+                                         : "r"(a1), "r"(a2), "r"(a7));
+        }
+        spinlock_unlock(&lock->lock);
+}

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/incbin.h

-              r453
+              r458
 #include <limits.h>
 # define INCBIN_ALIGNMENT_INDEX 7
+# define INCBIN_ALIGNMENT_INDEX 8
 /* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX' */
 …
 #define INCBIN_ALIGN_SHIFT_6 64
 #define INCBIN_ALIGN_SHIFT_7 128
+#define INCBIN_ALIGN_SHIFT_8 256
 /* Actual alignment value */
 #define INCBIN_ALIGNMENT \
     INCBIN_CONCATENATE( \
         INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \
         INCBIN_ALIGNMENT_INDEX)
+        INCBIN_CONCATENATE( \
+                INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \
+                INCBIN_ALIGNMENT_INDEX)
 /* Stringize */
 #define INCBIN_STR(X) \
     #X
+        #X
 #define INCBIN_STRINGIZE(X) \
     INCBIN_STR(X)
+        INCBIN_STR(X)
 /* Concatenate */
 #define INCBIN_CAT(X, Y) \
     X ## Y
+        X ## Y
 #define INCBIN_CONCATENATE(X, Y) \
     INCBIN_CAT(X, Y)
+        INCBIN_CAT(X, Y)
 /* Deferred macro expansion */
 #define INCBIN_EVAL(X) \
+    X
+        X
 #define INCBIN_INVOKE(N, ...) \
     INCBIN_EVAL(N(__VA_ARGS__))
+        INCBIN_EVAL(N(__VA_ARGS__))
 /* Green Hills uses a different directive for including binary data */
 …
 #ifndef _MSC_VER
 #  define INCBIN_ALIGN \
     __attribute__((aligned(INCBIN_ALIGNMENT)))
+        __attribute__((aligned(INCBIN_ALIGNMENT)))
 #else
 #  define INCBIN_ALIGN __declspec(align(INCBIN_ALIGNMENT))
 …
 #if defined(__arm__) || /* GNU C and RealView */ \
     defined(__arm) || /* Diab */ \
     defined(_ARM) /* ImageCraft */
+        defined(__arm) || /* Diab */ \
+        defined(_ARM) /* ImageCraft */
 #  define INCBIN_ARM
 #endif
 …
 #if defined(__APPLE__)
 /* The directives are different for Apple branded compilers */
 #  define INCBIN_SECTION         ".data\n"
+#  define INCBIN_SECTION         ".rodata\n"
 #  define INCBIN_GLOBAL(NAME)    ".globl " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
 #  define INCBIN_INT             ".long "
 …
 #  define INCBIN_TYPE(...)
 #else
 #  define INCBIN_SECTION         ".section .data\n"
+#  define INCBIN_SECTION         ".section .rodata\n"
 #  define INCBIN_GLOBAL(NAME)    ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
 #  define INCBIN_INT             ".int "
 …
 /* Style lookup: returning identifier */
 #define INCBIN_STYLE_IDENT(TYPE) \
     INCBIN_CONCATENATE( \
         INCBIN_STYLE_, \
         INCBIN_CONCATENATE( \
             INCBIN_EVAL(INCBIN_STYLE), \
             INCBIN_CONCATENATE(_, TYPE)))
+        INCBIN_CONCATENATE( \
+                INCBIN_STYLE_, \
+                INCBIN_CONCATENATE( \
+                        INCBIN_EVAL(INCBIN_STYLE), \
+                        INCBIN_CONCATENATE(_, TYPE)))
 /* Style lookup: returning string literal */
 #define INCBIN_STYLE_STRING(TYPE) \
     INCBIN_STRINGIZE( \
         INCBIN_STYLE_IDENT(TYPE)) \
+        INCBIN_STRINGIZE( \
+                INCBIN_STYLE_IDENT(TYPE)) \
 /* Generate the global labels by indirectly invoking the macro with our style
  * type and concatenating the name against them. */
 #define INCBIN_GLOBAL_LABELS(NAME, TYPE) \
     INCBIN_INVOKE( \
         INCBIN_GLOBAL, \
         INCBIN_CONCATENATE( \
             NAME, \
             INCBIN_INVOKE( \
                 INCBIN_STYLE_IDENT, \
                 TYPE))) \
     INCBIN_INVOKE( \
         INCBIN_TYPE, \
         INCBIN_CONCATENATE( \
             NAME, \
             INCBIN_INVOKE( \
                 INCBIN_STYLE_IDENT, \
                 TYPE)))
+        INCBIN_INVOKE( \
+                INCBIN_GLOBAL, \
+                INCBIN_CONCATENATE( \
+                        NAME, \
+                        INCBIN_INVOKE( \
+                                INCBIN_STYLE_IDENT, \
+                                TYPE))) \
+        INCBIN_INVOKE( \
+                INCBIN_TYPE, \
+                INCBIN_CONCATENATE( \
+                        NAME, \
+                        INCBIN_INVOKE( \
+                                INCBIN_STYLE_IDENT, \
+                                TYPE)))
 /**
 …
  */
 #define INCBIN_EXTERN(NAME) \
     INCBIN_EXTERNAL INCBIN_ALIGN unsigned char \
         INCBIN_CONCATENATE( \
             INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
             INCBIN_STYLE_IDENT(DATA))[]; \
     INCBIN_EXTERNAL INCBIN_ALIGN unsigned char * \
     INCBIN_CONCATENATE( \
         INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
         INCBIN_STYLE_IDENT(END)); \
     INCBIN_EXTERNAL unsigned int \
         INCBIN_CONCATENATE( \
             INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
             INCBIN_STYLE_IDENT(SIZE))
+        INCBIN_EXTERNAL INCBIN_ALIGN unsigned char \
+                INCBIN_CONCATENATE( \
+                        INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+                        INCBIN_STYLE_IDENT(DATA))[]; \
+        INCBIN_EXTERNAL INCBIN_ALIGN unsigned char * \
+        INCBIN_CONCATENATE( \
+                INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+                INCBIN_STYLE_IDENT(END)); \
+        INCBIN_EXTERNAL unsigned int \
+                INCBIN_CONCATENATE( \
+                        INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+                        INCBIN_STYLE_IDENT(SIZE))
 /**
 …
 #ifdef _MSC_VER
 #define INCBIN(NAME, FILENAME) \
     INCBIN_EXTERN(NAME)
+        INCBIN_EXTERN(NAME)
 #else
 #define INCBIN(NAME, FILENAME) \
     __asm__(INCBIN_SECTION \
             INCBIN_GLOBAL_LABELS(NAME, DATA) \
             INCBIN_ALIGN_HOST \
             INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \
             INCBIN_MACRO " \"" FILENAME "\"\n" \
             INCBIN_GLOBAL_LABELS(NAME, END) \
             INCBIN_ALIGN_BYTE \
             INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \
                 INCBIN_BYTE "1\n" \
             INCBIN_GLOBAL_LABELS(NAME, SIZE) \
             INCBIN_ALIGN_HOST \
             INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \
                 INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \
                            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \
     ); \
     INCBIN_EXTERN(NAME)
 #endif
 #endif
+        __asm__(INCBIN_SECTION \
+                        INCBIN_GLOBAL_LABELS(NAME, DATA) \
+                        INCBIN_ALIGN_HOST \
+                        INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \
+                        INCBIN_MACRO " \"" FILENAME "\"\n" \
+                        INCBIN_GLOBAL_LABELS(NAME, END) \
+                        INCBIN_ALIGN_BYTE \
+                        INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \
+                                INCBIN_BYTE "1\n" \
+                        INCBIN_GLOBAL_LABELS(NAME, SIZE) \
+                        INCBIN_ALIGN_HOST \
+                        INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \
+                                INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \
+                                                   INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \
+        ); \
+        INCBIN_EXTERN(NAME)
+#endif
+#endif

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.c

-              r453
+              r458
 #include "utils.h"
 #include "kpu_main.h"
+#include "kernel_cfg.h"
 #define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
-void sysctl_enable_irq(void)
+{
-    set_csr(mie, MIP_MEIP);
-    set_csr(mstatus, MSTATUS_MIE);
+}
-void sysctl_disable_irq(void)
+{
-    clear_csr(mie, MIP_MEIP);
-    clear_csr(mstatus, MSTATUS_MIE);
+}
 uint64_t sysctl_get_time_us(void)
+{
     uint64_t v_cycle = read_cycle();
     return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
+        uint64_t v_cycle = read_cycle();
+        return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
+}
 static int is_memory(uintptr_t address)
+{
     enum
+    {
         mem_len = 6 * 1024 * 1024,
         mem_no_cache_len = 8 * 1024 * 1024,
     };
     return ((address >= 0x80000000) && (address < 0x80000000 + mem_len)) || ((address >= 0x40000000) && (address < 0x40000000 + mem_no_cache_len)) || (address == 0x50450040);
+        enum
+        {
+                mem_len = 6 * 1024 * 1024,
+                mem_no_cache_len = 8 * 1024 * 1024,
+        };
+        return ((address >= 0x80000000) && (address < 0x80000000 + mem_len)) || ((address >= 0x40000000) && (address < 0x40000000 + mem_no_cache_len)) || (address == 0x50450040);
+}
 uint32_t is_memory_cache(uintptr_t address)
+{
     #define MEM_CACHE_LEN (6 * 1024 * 1024)
     return ((address >= 0x80000000) && (address < 0x80000000 + MEM_CACHE_LEN));
+#define MEM_CACHE_LEN (6 * 1024 * 1024)
+        return ((address >= 0x80000000) && (address < 0x80000000 + MEM_CACHE_LEN));
+}
 int plic_irq_enable(INTNO irq_number)
+{
     if (irq_number != INTNO_AI)
         return -1;
     ena_int(irq_number);
     return 0;
+        if (irq_number != INTNO_AI)
+                return -1;
+        ena_int(irq_number);
+        return 0;
+}
 int plic_set_priority(INTNO irq_number, uint32_t priority)
+{
     if (irq_number != INTNO_AI)
         return -1;
     set_ipriority(irq_number, priority);
     return 0;
+        if (irq_number != INTNO_AI)
+                return -1;
+        set_ipriority(irq_number, priority);
+        return 0;
+}
 …
 void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
+{
+    ER ret;
+    if (irq != INTNO_AI)
+        return;
+    ret = loc_cpu();
+    ai_done_callback = callback;
+    ai_done_ctx = ctx;
+    if (ret == E_OK)
+        unl_cpu();
+        if (irq != INTNO_AI)
+                return;
+        dis_int(INTNO_AI);
+        ai_done_callback = callback;
+        ai_done_ctx = ctx;
+        ena_int(INTNO_AI);
+}
 void ai_done_isr(intptr_t exinf)
+{
+    sysctl_disable_irq();
+    if (ai_done_callback != NULL){
+        ai_done_callback(ai_done_ctx);
+    }
+    sysctl_enable_irq();
+        dis_int(INTNO_AI);
+        if (ai_done_callback != NULL)
+        {
+                ai_done_callback(ai_done_ctx);
+        }
+        ena_int(INTNO_AI);
+}
 …
 void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
+    plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
+{
+    ER ret;
+    if (channel_num != AI_DMA_CH)
+        return;
+    set_ipriority(INTNO_DMAAI, priority);
+    ret = loc_cpu();
+    ai_dma_done_callback = dmac_callback;
+    ai_dma_done_ctx = ctx;
+    if (ret == E_OK)
+        unl_cpu();
+                                                   plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
+{
+        if (channel_num != AI_DMA_CH)
+                return;
+        //set_ipriority(INTNO_DMAAI, priority);
+        dis_int(INTNO_DMAAI);
+        ai_dma_done_callback = dmac_callback;
+        ai_dma_done_ctx = ctx;
+        ena_int(INTNO_DMAAI);
+}
 void ai_dma_done_isr(DMA_Handle_t *dma)
+{
+    sysctl_disable_irq();
+    if (ai_dma_done_callback != NULL) {
+        ai_dma_done_callback(ai_dma_done_ctx);
+    }
+    sysctl_enable_irq();
+        dis_int(INTNO_DMAAI);
+        if (ai_dma_done_callback != NULL)
+        {
+                ai_dma_done_callback(ai_dma_done_ctx);
+        }
+        ena_int(INTNO_DMAAI);
+}
 void dmac_set_irq(dmac_channel_number_t channel_num,
+    plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
+{
+    ER ret;
+    if (channel_num != AI_DMA_CH)
+        return;
+    set_ipriority(INTNO_DMAAI, priority);
+    ret = loc_cpu();
+    ai_dma_done_callback = dmac_callback;
+    ai_dma_done_ctx = ctx;
+    if (ret == E_OK)
+        unl_cpu();
+                                  plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
+{
+        if (channel_num != AI_DMA_CH)
+                return;
+        //set_ipriority(INTNO_DMAAI, priority);
+        dis_int(INTNO_DMAAI);
+        ai_dma_done_callback = dmac_callback;
+        ai_dma_done_ctx = ctx;
+        ena_int(INTNO_DMAAI);
+}
 …
 void dmac_set_single_mode(dmac_channel_number_t channel_num,
     const void *src, void *dest, uint8_t src_inc,
     uint8_t dest_inc,
     uint8_t dmac_burst_size,
     uint8_t dmac_trans_width,
     size_t block_size)
+{
     if (channel_num != AI_DMA_CH)
         return;
     DMA_Handle_t *hdma = &g_ai_hdma;
     int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
     uint8_t flow_control;
     if(mem_type_src == 0 && mem_type_dest == 0)
         flow_control = DMA_PERIPH_TO_PERIPH;
     else if(mem_type_src == 1 && mem_type_dest == 0)
         flow_control = DMA_MEMORY_TO_PERIPH;
     else if(mem_type_src == 0 && mem_type_dest == 1)
         flow_control = DMA_PERIPH_TO_MEMORY;
     else
         flow_control = DMA_MEMORY_TO_MEMORY;
     hdma->Init.Direction    = flow_control;     /* DMA転送方向 */
     hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);     /* ソースハンドシェイク */
     hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);    /* デスティネーションハンドシェイク */
     hdma->Init.SrcInc       = src_inc;  /* ソースインクリメント設定 */
     hdma->Init.DstInc       = dest_inc; /* デスティネーションインクリメント設定 */
     hdma->Init.SrcTransWidth = dmac_trans_width;        /* ソース転送幅 */
     hdma->Init.DstTransWidth = dmac_trans_width;        /* デスティネーション転送幅 */
     hdma->Init.SrcBurstSize = dmac_burst_size;  /* ソースバーストサイズ */
     hdma->Init.DstBurstSize = dmac_burst_size;  /* デスティネーションバーストサイズ */
     dma_reset(hdma);
     dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
+                                                  const void *src, void *dest, uint8_t src_inc,
+                                                  uint8_t dest_inc,
+                                                  uint8_t dmac_burst_size,
+                                                  uint8_t dmac_trans_width,
+                                                  size_t block_size)
+{
+        if (channel_num != AI_DMA_CH)
+                return;
+        DMA_Handle_t *hdma = &g_ai_hdma;
+        int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
+        uint8_t flow_control;
+        if (mem_type_src == 0 && mem_type_dest == 0)
+                flow_control = DMA_PERIPH_TO_PERIPH;
+        else if (mem_type_src == 1 && mem_type_dest == 0)
+                flow_control = DMA_MEMORY_TO_PERIPH;
+        else if (mem_type_src == 0 && mem_type_dest == 1)
+                flow_control = DMA_PERIPH_TO_MEMORY;
+        else
+                flow_control = DMA_MEMORY_TO_MEMORY;
+        hdma->Init.Direction = flow_control;                                                                                     /* DMA転送方向 */
+        hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);  /* ソースハンドシェイク */
+        hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* デスティネーションハンドシェイク */
+        hdma->Init.SrcInc = src_inc;                                                                                                     /* ソースインクリメント設定 */
+        hdma->Init.DstInc = dest_inc;                                                                                                    /* デスティネーションインクリメント設定 */
+        hdma->Init.SrcTransWidth = dmac_trans_width;                                                                     /* ソース転送幅 */
+        hdma->Init.DstTransWidth = dmac_trans_width;                                                                     /* デスティネーション転送幅 */
+        hdma->Init.SrcBurstSize = dmac_burst_size;                                                                               /* ソースバーストサイズ */
+        hdma->Init.DstBurstSize = dmac_burst_size;                                                                               /* デスティネーションバーストサイズ */
+        dma_reset(hdma);
+        dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
+}
 …
 static volatile uint32_t kpu_status;
-typedef struct kpu_context
+{
-    kpu_task_t kpu_task;
-    uint32_t kpu_status;
-} kpu_context_t;
-volatile kpu_context_t g_kpu_context;
-static int kpu_run_all_done(void *_task)
+{
-    atomic_swap(&g_kpu_context.kpu_status, 0);
-    kpu_task_t *task = (kpu_task_t *)_task;
-    task->callback(task);
-    return 0;
+}
-int kpu_continue(void *_task)
+{
-    kpu_task_t *task = (kpu_task_t *)_task;
-    int layer_burst_size = 1;
-    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
-        .calc_done_int = 1,
-        .layer_cfg_almost_empty_int = 1,
-        .layer_cfg_almost_full_int = 1};
-    if(task->remain_layers_length == 0)
+    {
-        return 0;
+    }
-    if(task->remain_layers_length <= layer_burst_size)
+    {
-        for(uint32_t i = 0; i < task->remain_layers_length; i++)
+        {
-            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
+        }
-        task->remain_layers_length = 0;
-    } else
+    {
-        for(uint32_t i = 0; i < layer_burst_size; i++)
+        {
-            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
+        }
-        task->remain_layers += layer_burst_size;
-        task->remain_layers_length -= layer_burst_size;
+    }
-    return 0;
+}
-static int kpu_run_dma_output(uint32_t dma_ch, void *dst, uint32_t length, plic_irq_callback_t cb, void *_task)
+{
-    select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
-    kpu_dmac_irq_register(dma_ch, kpu_run_all_done, _task, 1);
-    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
-                        DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);
-    return 0;
+}
-static int kpu_run_dma_input_done_push_layers(void *_task)
+{
-    kpu_task_t *task = (kpu_task_t *)_task;
-    kpu->interrupt_clear.reg = 7;
-    dma_end(&g_ai_hdma);
-    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
-        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
-    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
-        .eight_bit_mode = task->eight_bit_mode};
-    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
-    kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);
-    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
-        .calc_done_int = 0,
-        .layer_cfg_almost_empty_int = 0,
-        .layer_cfg_almost_full_int = 1};
-    kpu_continue(task);
-    return 0;
+}
-static void kpu_run_dma_input(uint32_t dma_ch, const void *src, plic_irq_callback_t cb, void *_task)
+{
-    kpu_task_t *task = _task;
-    kpu_layer_argument_t *first_layer = &task->layers[0];
-    uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);
-    kpu_dmac_irq_register(dma_ch, cb, _task, 1);
-    dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
-                        DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
+}
-int kpu_run(kpu_task_t *v_task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback)
+{
-    if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
-        return -1;
-    memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
-    kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;
-    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
-    uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;
-    last_layer->dma_parameter.data.send_data_out = 1;
-    last_layer->interrupt_enabe.data.int_en = 1;
-    task->dma_ch = dma_ch;
-    task->dst = dest;
-    task->dst_length = output_size;
-    task->callback = callback;
-    task->remain_layers_length = task->layers_length;
-    task->remain_layers = task->layers;
-    plic_set_priority(INTNO_AI, 1);
-    plic_irq_register(INTNO_AI, kpu_continue, task);
-    plic_irq_enable(INTNO_AI);
-    kpu_run_dma_input(dma_ch, src, kpu_run_dma_input_done_push_layers, task);
-    return 0;
+}
-uint8_t *kpu_get_output_buf(kpu_task_t *task)
+{
-    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
-    size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8 * 8;
-    return malloc(output_size);
+}
-void kpu_release_output_buf(uint8_t *output_buf)
+{
-    if(output_buf != NULL)
-        free(output_buf);
+}
-static int kpu_done(void *ctx)
+{
-    atomic_swap(&kpu_status, 0);
-    kpu_task_t *task = (kpu_task_t *)ctx;
-    task->callback(task->ctx);
-    return 0;
+}
-static int kpu_config_input(void *ctx)
+{
-    kpu_task_t *task = (kpu_task_t *)ctx;
-    kpu->interrupt_clear.reg = 7;
-    if(task->remain_layers_length <= LAYER_BURST_SIZE)
+    {
-        for(uint32_t i = 0; i < task->remain_layers_length; i++)
+        {
-            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
+        }
-        task->remain_layers_length = 0;
-        kpu->interrupt_mask.reg = 7;
-    } else
+    {
-        for(uint32_t i = 0; i < LAYER_BURST_SIZE; i++)
+        {
-            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
-            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
+        }
-        task->remain_layers += LAYER_BURST_SIZE;
-        task->remain_layers_length -= LAYER_BURST_SIZE;
+    }
-    return 0;
+}
-static void kpu_data_output(kpu_task_t *task)
+{
-    select_dma_channel(task->dma_ch, DMA_SELECT_AI_RX_REQ);
-    kpu_dmac_irq_register(task->dma_ch, kpu_done, task, 1);
-    dmac_set_single_mode(task->dma_ch, (void *)(&kpu->fifo_data_out), (void *)(task->dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
-                        DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, task->dst_length);
+}
-static int kpu_data_ready(void *ctx)
+{
-    kpu_task_t *task = (kpu_task_t *)ctx;
-    dma_end(&g_ai_hdma);
-    kpu_data_output(task);
-    kpu->eight_bit_mode.reg = task->eight_bit_mode;
-    kpu->interrupt_mask.reg = 7;
-    kpu->interrupt_clear.reg = 7;
-    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
-        .fifo_full_threshold = 12, .fifo_empty_threshold = 1};
-    plic_set_priority(INTNO_AI, 2);
-    plic_irq_register(INTNO_AI, kpu_config_input, task);
-    plic_irq_enable(INTNO_AI);
-    kpu_config_input(task);
-    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
-        .calc_done_int = 1,
-        .layer_cfg_almost_empty_int = 0,
-        .layer_cfg_almost_full_int = 1};
-    return 0;
+}
-static void kpu_data_input(kpu_task_t *task)
+{
-    if(task->src == NULL)
+    {
-        kpu_data_ready(task);
-        return;
+    }
-    kpu_dmac_irq_register(task->dma_ch, kpu_data_ready, task, 1);
-    kpu_layer_argument_t *layer = &task->layers[0];
-    dmac_set_single_mode(task->dma_ch, (void *)(uintptr_t)task->src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
-                        DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, task->src_length);
+}
-int kpu_single_task_init(kpu_task_t *task)
+{
-    /*
-    *  AIクロック有効化
-    */
-    sil_orw_mem((uint32_t *)(TADR_SYSCTL_BASE+TOFF_SYSCTL_CLK_EN_PERI), SYSCTL_CLK_EN_PERI_AI_CLK_EN);
-    kpu_layer_argument_t *first_layer = &task->layers[0];
-    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
-    last_layer->dma_parameter.data.send_data_out = 1;
-    last_layer->interrupt_enabe.data.int_en = 1;
-    task->src_length = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1) / 8;
-    task->dst_length = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8;
-    task->dst = (uint64_t *)malloc(task->dst_length * 8);
-    if(task->dst == NULL)
-        return 1;
-    memset(task->dst, 0, task->dst_length * 8);
-    return 0;
+}
-int kpu_single_task_deinit(kpu_task_t *task)
+{
-    free(task->dst);
-    return 0;
+}
-int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta)
+{
-    uintptr_t base_addr = (uintptr_t)buffer;
-    kpu_model_header_t *header = (kpu_model_header_t *)buffer;
-    kpu_model_layer_metadata_t *layer_meta = (kpu_model_layer_metadata_t *)(base_addr + sizeof(kpu_model_header_t));
-    kpu_layer_argument_t *layers = (kpu_layer_argument_t *)(base_addr + header->layers_argument_start);
-    if(header->version != 1)
-        return -1;
-    uint32_t layers_length = header->layers_length;
-    task->layers_length = layers_length;
-    task->eight_bit_mode = header->flags & 1;
-    task->layers = layers;
-    task->output_scale = layer_meta[layers_length - 1].output_scale;
-    task->output_bias = layer_meta[layers_length - 1].output_bias;
-    size_t i;
-    for(i = 0; i < layers_length; i++)
+    {
-        layers[i].kernel_load_cfg.data.para_start_addr = (uint64_t)(base_addr + layer_meta[i].weigths_offset);
-        layers[i].kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)(base_addr + layer_meta[i].bn_offset);
-        layers[i].kernel_calc_type_cfg.data.active_addr = (uint64_t)(base_addr + layer_meta[i].act_offset);
+    }
-    if(meta)
-        *meta = layer_meta;
-    return 0;
+}
-int kpu_start(kpu_task_t *task)
+{
-    if(atomic_cas(&kpu_status, 0, 1))
-        return -1;
-    task->remain_layers_length = task->layers_length;
-    task->remain_layers = task->layers;
-    kpu_data_input(task);
-    return 0;
+}
 static void kpu_send_layer(const kpu_layer_argument_t *layer)
+{
+    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
+    kpu->layer_argument_fifo = layer->image_addr.reg;
+    kpu->layer_argument_fifo = layer->image_channel_num.reg;
+    kpu->layer_argument_fifo = layer->image_size.reg;
+    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
+    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
+    kpu->layer_argument_fifo = layer->kernel_offset.reg;
+    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
+    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
+    kpu->layer_argument_fifo = layer->conv_value.reg;
+    kpu->layer_argument_fifo = layer->conv_value2.reg;
+    kpu->layer_argument_fifo = layer->dma_parameter.reg;
+}
+void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
+{
+    kpu->interrupt_clear.reg = 7;
+    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
+        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
+    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
+        .eight_bit_mode = eight_bit_mode};
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 0,
+        .layer_cfg_almost_full_int = 1};
+    plic_set_priority(INTNO_AI, 1);
+    plic_irq_register(INTNO_AI, callback, userdata);
+    plic_irq_enable(INTNO_AI);
+        kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
+        kpu->layer_argument_fifo = layer->image_addr.reg;
+        kpu->layer_argument_fifo = layer->image_channel_num.reg;
+        kpu->layer_argument_fifo = layer->image_size.reg;
+        kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
+        kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
+        kpu->layer_argument_fifo = layer->kernel_offset.reg;
+        kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
+        kpu->layer_argument_fifo = layer->write_back_cfg.reg;
+        kpu->layer_argument_fifo = layer->conv_value.reg;
+        kpu->layer_argument_fifo = layer->conv_value2.reg;
+        kpu->layer_argument_fifo = layer->dma_parameter.reg;
+}
 void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
+{
     uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
     dmac_set_irq(dma_ch, callback, userdata, 1);
     dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
+        uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
+        dmac_set_irq(dma_ch, callback, userdata, 1);
+        dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
+                                                 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
+}
 static void kpu_conv2d_core(kpu_layer_argument_t *layer)
+{
     kpu_send_layer(layer);
+        kpu_send_layer(layer);
+}
 void kpu_conv2d(kpu_layer_argument_t *layer)
+{
+    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1};
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 0,
+        .layer_cfg_almost_full_int = 1};
+    kpu_conv2d_core(layer);
+}
+void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
+{
+    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1};
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1};
+    layer->dma_parameter.data.send_data_out = 1;
+    select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
+    dmac_set_irq(dma_ch, callback, userdata, 1);
+    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
+                        DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer->dma_parameter.data.dma_total_byte + 8) / 8);
+    kpu_conv2d_core(layer);
+}
+void kpu_conv2d_output_full_add(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint64_t *dest, plic_irq_callback_t callback, void *userdata)
+{
+    uint32_t channels = layer->image_channel_num.data.o_ch_num + 1;
+    layer->interrupt_enabe.data.full_add = 1;
+    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1};
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 1,
+        .layer_cfg_almost_full_int = 1};
+    layer->dma_parameter.data.send_data_out = 1;
+    select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
+    dmac_set_irq(dma_ch, callback, userdata, 1);
+    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
+                        DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, channels);
+    kpu_conv2d_core(layer);
+}
+void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, size_t count, uint8_t *dest, const quantize_param_t *dest_param)
+{
+    quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
+    size_t i;
+    for(i = 0; i < count; i++)
+    {
+        int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
+        if(value < 0)
+            value = 0;
+        if(value > 0xFF)
+            value = 0xFF;
+        *dest++ = value;
+    }
+        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
+                .calc_done_int = 1,
+                .layer_cfg_almost_empty_int = 1,
+                .layer_cfg_almost_full_int = 1};
+        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+                .calc_done_int = 1,
+                .layer_cfg_almost_empty_int = 0,
+                .layer_cfg_almost_full_int = 1};
+        kpu_conv2d_core(layer);
+}
 void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
+{
+    quantize_param_t q1 = *src_param, q2 = *dest_param;
+    size_t oc, y, x;
+    if(((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
+    {
+        uint32_t row_padding = 16;
+        uint32_t row_group = 4;
+        uint32_t row_length = 1;
+        uint32_t height = 4;
+        for(oc = 0; oc < channels; oc++)
+        {
+            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+            for(y = 0; y < 1; y++)
+            {
+                uint8_t *y_origin = channel_origin + y * row_length * 64;
+                for(x = 0; x < 1; x++)
+                {
+                    int64_t sum = 0;
+                    size_t i;
+                    for(i = 0; i < kernel_size; i++)
+                        sum += *src++;
+                    int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
+                    if(value < 0)
+                        value = 0;
+                    if(value > 0xFF)
+                        value = 0xFF;
+                    y_origin[x] = value;
+                }
+            }
+        }
+    } else
+    {
+        for(oc = 0; oc < channels; oc++)
+        {
+            int64_t sum = 0;
+            size_t i;
+            for(i = 0; i < kernel_size; i++)
+                sum += *src++;
+            int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
+            if(value < 0)
+                value = 0;
+            if(value > 0xFF)
+                value = 0xFF;
+            dest[oc] = value;
+        }
+    }
+        quantize_param_t q1 = *src_param, q2 = *dest_param;
+        size_t oc, y, x;
+        if (((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
+        {
+                uint32_t row_padding = 16;
+                uint32_t row_group = 4;
+                uint32_t row_length = 1;
+                uint32_t height = 4;
+                for (oc = 0; oc < channels; oc++)
+                {
+                        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+                        for (y = 0; y < 1; y++)
+                        {
+                                uint8_t *y_origin = channel_origin + y * row_length * 64;
+                                for (x = 0; x < 1; x++)
+                                {
+                                        int64_t sum = 0;
+                                        size_t i;
+                                        for (i = 0; i < kernel_size; i++)
+                                                sum += *src++;
+                                        int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
+                                        if (value < 0)
+                                                value = 0;
+                                        if (value > 0xFF)
+                                                value = 0xFF;
+                                        y_origin[x] = value;
+                                }
+                        }
+                }
+        }
+        else
+        {
+                for (oc = 0; oc < channels; oc++)
+                {
+                        int64_t sum = 0;
+                        size_t i;
+                        for (i = 0; i < kernel_size; i++)
+                                sum += *src++;
+                        int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
+                        if (value < 0)
+                                value = 0;
+                        if (value > 0xFF)
+                                value = 0xFF;
+                        dest[oc] = value;
+                }
+        }
+}
 void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
+{
+    quantize_param_t q = *src_param;
+    size_t oc;
+    for(oc = 0; oc < channels; oc++)
+    {
+        int64_t sum = 0;
+        size_t i;
+        for(i = 0; i < kernel_size; i++)
+            sum += *src++;
+        float value = (sum * q.scale + q.bias) / kernel_size;
+        dest[oc] = value;
+    }
+}
+void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param)
+{
+    quantize_param_t q1 = *dest_param;
+    size_t i = 0;
+    for(i = 0; i < channels; i++)
+        *dest++ = src[i * 16] * q1.scale + q1.bias;
+}
+void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels)
+{
+    int ic, oc;
+    for(oc = 0; oc < output_channels; oc++)
+    {
+        const float *c_weights = weights + oc * input_channels;
+        float sum = 0.0f;
+        for(ic = 0; ic < input_channels; ic++)
+            sum += src[ic] * c_weights[ic];
+        dest[oc] = sum + biases[oc];
+    }
+}
+void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest)
+{
+    quantize_param_t q1 = *src_param;
+    size_t i = 0;
+    for(i = 0; i < count; i++)
+        *dest++ = src[i] * q1.scale + q1.bias;
+}
+void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels)
+{
+    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64);
+    size_t oc, y, x;
+    uint32_t row_padding;
+    uint32_t row_group;
+    uint32_t row_length;
+    if(width <= 16)
+    {
+        row_padding = 16;
+        row_group = 4;
+        row_length = 1;
+    } else if(width <= 32)
+    {
+        row_padding = 32;
+        row_group = 2;
+        row_length = 1;
+    } else
+    {
+        row_padding = 64;
+        row_group = 1;
+        row_length = (width + 63) / 64;
+    }
+    for(oc = 0; oc < channels; oc++)
+    {
+        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+        for(y = 0; y < height; y++)
+        {
+            uint8_t *y_origin = channel_origin + y * row_length * 64;
+            for(x = 0; x < width; x++)
+                y_origin[x] = *src++;
+        }
+    }
+}
+        quantize_param_t q = *src_param;
+        size_t oc;
+        for (oc = 0; oc < channels; oc++)
+        {
+                int64_t sum = 0;
+                size_t i;
+                for (i = 0; i < kernel_size; i++)
+                        sum += *src++;
+                float value = (sum * q.scale + q.bias) / kernel_size;
+                dest[oc] = value;
+        }
+}
 #if USE_CACHED_AI_RAM
 static void kpu_flush_cache(uint32_t addr, size_t lines)
+{
     size_t line;
     for(line = 0; line < lines; line++)
+    {
         const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
         uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
         size_t i;
         for(i = 0; i < 8; i++)
             dest[i] = src[i];
+    }
+        size_t line;
+        for (line = 0; line < lines; line++)
+        {
+                const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
+                uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
+                size_t i;
+                for (i = 0; i < 8; i++)
+                        dest[i] = src[i];
+        }
+}
 #endif
 static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
+{
+    if(shift > 0)
+    {
+        value >>= shift - 1;
+        if(value & 0x1)
+        {
+            if(value < 0)
+                value = (value >> 1) - 1;
+            else
+                value = (value >> 1) + 1;
+        } else
+        {
+            value >>= 1;
+        }
+    }
+    return value;
+        if (shift > 0)
+        {
+                value >>= shift - 1;
+                if (value & 0x1)
+                {
+                        if (value < 0)
+                                value = (value >> 1) - 1;
+                        else
+                                value = (value >> 1) + 1;
+                }
+                else
+                {
+                        value >>= 1;
+                }
+        }
+        return value;
+}
 static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
+{
+    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
+    size_t oc, y, x;
+    uint32_t row_padding;
+    uint32_t row_group;
+    uint32_t row_length;
+    if(width <= 16)
+    {
+        row_padding = 16;
+        row_group = 4;
+        row_length = 1;
+    } else if(width <= 32)
+    {
+        row_padding = 32;
+        row_group = 2;
+        row_length = 1;
+    } else
+    {
+        row_padding = 64;
+        row_group = 1;
+        row_length = (width + 63) / 64;
+    }
+    if((uintptr_t)src % 8 == 0 && width % 8 == 0)
+    {
+        uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
+        size_t oc, y, x;
+        uint32_t row_padding;
+        uint32_t row_group;
+        uint32_t row_length;
+        if (width <= 16)
+        {
+                row_padding = 16;
+                row_group = 4;
+                row_length = 1;
+        }
+        else if (width <= 32)
+        {
+                row_padding = 32;
+                row_group = 2;
+                row_length = 1;
+        }
+        else
+        {
+                row_padding = 64;
+                row_group = 1;
+                row_length = (width + 63) / 64;
+        }
+        if ((uintptr_t)src % 8 == 0 && width % 8 == 0)
+        {
 #define UPLOAD_BEGIN()                                                                                             \
     for(oc = 0; oc < channels; oc++)                                                                               \
     {                                                                                                              \
         uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
         for(y = 0; y < height; y++)                                                                                \
         {                                                                                                          \
             uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);
+        for (oc = 0; oc < channels; oc++)                                                                              \
+        {                                                                                                              \
+                uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
+                for (y = 0; y < height; y++)                                                                               \
+                {                                                                                                          \
+                        uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);
 #define UPLOAD_END() \
+    }                \
+    }
+        width /= 8;
+        const uint64_t *u64_src = (const uint64_t *)src;
+        if(width == 1)
+        {
+            UPLOAD_BEGIN()
+            y_origin[0] = *u64_src++;
+            UPLOAD_END()
+        } else if(width == 2)
+        {
+            UPLOAD_BEGIN()
+            {
+                y_origin[0] = *u64_src++;
+                y_origin[1] = *u64_src++;
+            }
+            UPLOAD_END()
+        } else if(width == 4)
+        {
+            UPLOAD_BEGIN()
+            {
+                y_origin[0] = *u64_src++;
+                y_origin[1] = *u64_src++;
+                y_origin[2] = *u64_src++;
+                y_origin[3] = *u64_src++;
+            }
+            UPLOAD_END()
+        } else
+        {
+            UPLOAD_BEGIN()
+            for(x = 0; x < width; x++)
+                y_origin[x] = *u64_src++;
+            UPLOAD_END()
+        }
+    } else
+    {
+        for(oc = 0; oc < channels; oc++)
+        {
+            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+            for(y = 0; y < height; y++)
+            {
+                uint8_t *y_origin = channel_origin + y * row_length * 64;
+                for(x = 0; x < width; x++)
+                    y_origin[x] = *src++;
+            }
+        }
+    }
+        }                \
+        }
+                width /= 8;
+                const uint64_t *u64_src = (const uint64_t *)src;
+                if (width == 1)
+                {
+                        UPLOAD_BEGIN()
+                        y_origin[0] = *u64_src++;
+                        UPLOAD_END()
+                }
+                else if (width == 2)
+                {
+                        UPLOAD_BEGIN()
+                        {
+                                y_origin[0] = *u64_src++;
+                                y_origin[1] = *u64_src++;
+                        }
+                        UPLOAD_END()
+                }
+                else if (width == 4)
+                {
+                        UPLOAD_BEGIN()
+                        {
+                                y_origin[0] = *u64_src++;
+                                y_origin[1] = *u64_src++;
+                                y_origin[2] = *u64_src++;
+                                y_origin[3] = *u64_src++;
+                        }
+                        UPLOAD_END()
+                }
+                else
+                {
+                        UPLOAD_BEGIN()
+                        for (x = 0; x < width; x++)
+                                y_origin[x] = *u64_src++;
+                        UPLOAD_END()
+                }
+        }
+        else
+        {
+                for (oc = 0; oc < channels; oc++)
+                {
+                        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+                        for (y = 0; y < height; y++)
+                        {
+                                uint8_t *y_origin = channel_origin + y * row_length * 64;
+                                for (x = 0; x < width; x++)
+                                        y_origin[x] = *src++;
+                        }
+                }
+        }
+}
 static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
+{
     size_t width = layer->image_size.data.i_row_wid + 1;
     size_t height = layer->image_size.data.i_col_high + 1;
     size_t channels = layer->image_channel_num.data.i_ch_num + 1;
     kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
+        size_t width = layer->image_size.data.i_row_wid + 1;
+        size_t height = layer->image_size.data.i_col_high + 1;
+        size_t channels = layer->image_channel_num.data.i_ch_num + 1;
+        kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
+}
 static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
+{
     memcpy(dest, src, count * sizeof(float));
+        memcpy(dest, src, count * sizeof(float));
+}
 static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
+{
+    size_t i;
+    if(act == KLA_RELU)
+    {
+        for(i = 0; i < count; i++)
+            data[i] = max(data[i], 0);
+    } else if(act == KLA_RELU6)
+    {
+        for(i = 0; i < count; i++)
+            data[i] = min(max(data[i], 0), 6);
+    }
+        size_t i;
+        if (act == KLA_RELU)
+        {
+                for (i = 0; i < count; i++)
+                        data[i] = max(data[i], 0);
+        }
+        else if (act == KLA_RELU6)
+        {
+                for (i = 0; i < count; i++)
+                        data[i] = min(max(data[i], 0), 6);
+        }
+}
 static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
     const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
     const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
     float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     size_t i, count = arg->count;
     for(i = 0; i < count; i++)
         dest[i] = src_a[i] + src_b[i];
+        const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
+        const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
+        float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+        size_t i, count = arg->count;
+        for (i = 0; i < count; i++)
+                dest[i] = src_a[i] + src_b[i];
+}
 /*
  * Adds two quantized uint8 buffers element by element and stores the
  * requantized uint8 result.
  *
  * Per element: each input has its offset added and is multiplied by its
  * multiplier. When both inputs use the same shift the two products are
  * summed and shifted once; otherwise each product is shifted by its own
  * amount before the sum. The sum is multiplied by out_mul, passed through
  * kpu_carry_shift() with out_shift, has out_offset added and is limited by
  * min(0xFF, max(0, v)) before being written.
  *
  * Work is done in groups of eight elements; the group count is
  * ALIGN_UP(arg->count, 8) / 8, so the last group may touch elements beyond
  * arg->count. NOTE(review): presumably the buffers are padded to a multiple
  * of eight elements - confirm against the model layout.
  *
  * The QADD_UNROLL_* helper macros are defined inside the function body and
  * stay defined after it (with the "different shift" variants of stages 4
  * and 5).
  */
 static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
     const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
     uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
     int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
     int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
     int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;
     /* Number of eight-element groups to process. */
     size_t groups = ALIGN_UP(arg->count, 8) / 8;
     size_t g;
     if(sh_a == sh_b)
     {
         /* Equal shifts: add the scaled inputs first, then shift the sum once. */
 #define QADD_UNROLL_1(x)     \
     int64_t a##x = *src_a++; \
     int64_t b##x = *src_b++;
 #define QADD_UNROLL_2(x) \
     a##x += off_a;       \
     b##x += off_b;
 #define QADD_UNROLL_3(x) \
     a##x *= mul_a;       \
     b##x *= mul_b;
 #define QADD_UNROLL_4(x) \
     int64_t v##x = a##x + b##x;
 #define QADD_UNROLL_5(x) \
     v##x >>= sh_a;
 #define QADD_UNROLL_6(x) \
     v##x *= mul_o;
 #define QADD_UNROLL_7(x) \
     v##x = kpu_carry_shift(v##x, sh_o);
 #define QADD_UNROLL_8(x) \
     v##x += off_o;
 #define QADD_UNROLL_9(x) \
     v##x = min(0xFF, max(0, v##x));
 #define QADD_UNROLL_10(x) \
     *dest++ = v##x;
         /* Applies stage x to the eight lanes 0..7 of the current group. */
 #define QADD_UNROLL_S(x)                       \
     QADD_UNROLL_##x(0)                         \
         QADD_UNROLL_##x(1)                     \
             QADD_UNROLL_##x(2)                 \
                 QADD_UNROLL_##x(3)             \
                     QADD_UNROLL_##x(4)         \
                         QADD_UNROLL_##x(5)     \
                             QADD_UNROLL_##x(6) \
                                 QADD_UNROLL_##x(7)
         for(g = 0; g < groups; g++)
         {
             QADD_UNROLL_S(1);
             QADD_UNROLL_S(2);
             QADD_UNROLL_S(3);
             QADD_UNROLL_S(4);
             QADD_UNROLL_S(5);
             QADD_UNROLL_S(6);
             QADD_UNROLL_S(7);
             QADD_UNROLL_S(8);
             QADD_UNROLL_S(9);
             QADD_UNROLL_S(10);
         }
     }
     else
     {
         /*
          * Different shifts: shift each scaled input by its own amount and
          * only then add. Only stages 4 and 5 differ from the branch above;
          * every other stage macro is reused unchanged.
          */
 #undef QADD_UNROLL_4
 #define QADD_UNROLL_4(x) \
     a##x >>= sh_a;       \
     b##x >>= sh_b;
 #undef QADD_UNROLL_5
 #define QADD_UNROLL_5(x) \
     int64_t v##x = a##x + b##x;
         for(g = 0; g < groups; g++)
         {
             QADD_UNROLL_S(1);
             QADD_UNROLL_S(2);
             QADD_UNROLL_S(3);
             QADD_UNROLL_S(4);
             QADD_UNROLL_S(5);
             QADD_UNROLL_S(6);
             QADD_UNROLL_S(7);
             QADD_UNROLL_S(8);
             QADD_UNROLL_S(9);
             QADD_UNROLL_S(10);
         }
     }
 }
 /*
  * Global average pooling over float data.
  *
  * The input is read as arg->channels consecutive runs of arg->kernel_size
  * floats; the mean of each run is written to the matching output slot.
  */
 static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const float *in = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
     float *out = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     size_t channels = arg->channels;
     size_t kernel_size = arg->kernel_size;
     size_t ch, k;
     for(ch = 0; ch < channels; ch++)
     {
         /* Start of the run of samples that belongs to this channel. */
         const float *plane = in + ch * kernel_size;
         float total = 0.f;
         for(k = 0; k < kernel_size; k++)
         {
             total += plane[k];
         }
         out[ch] = total / kernel_size;
     }
 }
 /*
  * 2D max pooling over uint8 data.
  *
  * Channel oc of the input starts at src + width * height * oc. For every
  * output position the pooling window is clipped to the input bounds, so
  * padded positions never contribute; the result starts from 0 and is the
  * largest sample found in the clipped window. Outputs are written
  * sequentially in channel, row, column order.
  */
 static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
     uint8_t *out = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
     kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
     uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
     uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
     uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
     uint32_t oc, out_y, out_x;
     for(oc = 0; oc < out_shape.channels; oc++)
     {
         const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
         for(out_y = 0; out_y < out_shape.height; out_y++)
         {
             /* The vertical extent of the window depends only on the output row. */
             int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
             int32_t kernel_y_start = max(0, -in_y_origin);
             int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
             for(out_x = 0; out_x < out_shape.width; out_x++)
             {
                 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                 int32_t kernel_x_start = max(0, -in_x_origin);
                 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                 uint8_t best = 0;
                 int32_t ky, kx;
                 for(ky = kernel_y_start; ky < kernel_y_end; ky++)
                 {
                     int32_t in_y = in_y_origin + ky;
                     for(kx = kernel_x_start; kx < kernel_x_end; kx++)
                     {
                         int32_t in_x = in_x_origin + kx;
                         best = max(best, channel_src[in_y * in_shape.width + in_x]);
                     }
                 }
                 *out++ = best;
             }
         }
     }
 }
 /*
  * 2D average pooling over float data.
  *
  * Channel oc of the input starts at src + width * height * oc. For every
  * output position the pooling window is clipped to the input bounds and
  * the samples inside it are summed; the sum is divided by the number of
  * samples actually visited, so padded positions are not counted. Outputs
  * are written sequentially in channel, row, column order.
  */
 static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
     float *out = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
     uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
     uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
     uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
     uint32_t oc, out_y, out_x;
     for(oc = 0; oc < out_shape.channels; oc++)
     {
         const float *channel_src = src + in_shape.width * in_shape.height * oc;
         for(out_y = 0; out_y < out_shape.height; out_y++)
         {
             /* The vertical extent of the window depends only on the output row. */
             int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
             int32_t kernel_y_start = max(0, -in_y_origin);
             int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
             for(out_x = 0; out_x < out_shape.width; out_x++)
             {
                 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                 int32_t kernel_x_start = max(0, -in_x_origin);
                 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                 float total = 0;
                 float samples = 0;
                 int32_t ky, kx;
                 for(ky = kernel_y_start; ky < kernel_y_end; ky++)
                 {
                     int32_t in_y = in_y_origin + ky;
                     for(kx = kernel_x_start; kx < kernel_x_end; kx++)
                     {
                         int32_t in_x = in_x_origin + kx;
                         total += channel_src[in_y * in_shape.width + in_x];
                         samples++;
                     }
                 }
                 *out++ = total / samples;
             }
         }
     }
 }
 /*
  * Quantizes arg->count floats to uint8.
  *
  * Each input is mapped to roundf((value - q.bias) * (1 / q.scale)) and
  * saturated to the range 0..0xFF.
  *
  * The saturation is done on the float value before it is narrowed: a
  * float-to-integer conversion whose result does not fit the target type is
  * undefined behaviour in C, which the previous "convert to int, then clamp"
  * order could hit for very large magnitudes, infinities or NaN (for example
  * when q.scale is 0). NaN now maps to 0.
  */
 static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     size_t count = arg->count;
     const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
     kpu_model_quant_param_t q = arg->quant_param;
     float scale = 1.f / q.scale;
     uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
     size_t i;
     for(i = 0; i < count; i++)
     {
         float rounded = roundf((src[i] - q.bias) * scale);
         uint8_t quantized;
         if(rounded >= 255.f)
         {
             quantized = 0xFF;
         }
         else if(rounded > 0.f)
         {
             /* 0 < rounded < 255 here, so the conversion is always in range. */
             quantized = (uint8_t)rounded;
         }
         else
         {
             /* Negative values, zero and NaN (every comparison above is false). */
             quantized = 0;
         }
         dest[i] = quantized;
     }
 }
 /*
  * Dequantizes arg->count uint8 values to float using one scale/bias pair:
  * out = in * q.scale + q.bias.
  */
 static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const uint8_t *in = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
     float *out = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     kpu_model_quant_param_t q = arg->quant_param;
     size_t total = arg->count;
     size_t idx;
     for(idx = 0; idx < total; idx++)
     {
         out[idx] = in[idx] * q.scale + q.bias;
     }
 }
 /*
  * Dequantizes uint8 data to float with a separate scale/bias per channel.
  *
  * The data is treated as arg->channels consecutive runs of
  * arg->channel_size values; run oc uses arg->quant_params[oc] and each
  * value becomes in * q.scale + q.bias.
  */
 static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
 {
     const uint8_t *in = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
     float *out = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     size_t channels = arg->channels;
     size_t per_channel = arg->channel_size;
     size_t ch, k;
     /* Flat position shared by the input and output buffers. */
     size_t pos = 0;
     for(ch = 0; ch < channels; ch++)
     {
         const kpu_model_quant_param_t q = arg->quant_params[ch];
         for(k = 0; k < per_channel; k++)
         {
             out[pos] = in[pos] * q.scale + q.bias;
             pos++;
         }
     }
 }
 /*
  * Requantizes arg->count uint8 values through a lookup table:
  * dest[i] = arg->table[src[i]].
  *
  * The former eight-way unrolled variant was guarded by "false && ..." and
  * could never execute, so it has been removed; the remaining loop is the
  * code path that was always taken.
  */
 static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
     uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
     size_t oc, count = arg->count;
     const uint8_t *table = arg->table;
     for(oc = 0; oc < count; oc++)
     {
         dest[oc] = table[src[oc]];
     }
 }
 /*
  * L2-normalizes a vector of arg->channels floats.
  *
  * The sum of squares is floored at 1e-10 before taking the reciprocal
  * square root, which keeps the scale factor finite for an all-zero input.
  * Every input is then multiplied by that factor.
  */
 static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const float *in = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
     float *out = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     const float epsilon = 1e-10f;
     size_t channels = arg->channels;
     size_t ch;
     float norm_sq = 0.f;
     float inv_norm;
     for(ch = 0; ch < channels; ch++)
     {
         norm_sq += in[ch] * in[ch];
     }
     inv_norm = 1.f / sqrtf((norm_sq < epsilon) ? epsilon : norm_sq);
     for(ch = 0; ch < channels; ch++)
     {
         out[ch] = in[ch] * inv_norm;
     }
 }
 /*
  * Softmax over a vector of arg->channels floats.
  *
  * The largest input is subtracted from every element before expf() so the
  * exponentials cannot overflow; the results are then divided by their sum.
  *
  * The running maximum is seeded with -FLT_MAX. It used to be seeded with
  * FLT_MIN, which is the smallest positive normalized float rather than the
  * most negative value: with all-negative inputs the "maximum" stayed at
  * FLT_MIN, and for strongly negative inputs every exponential underflowed
  * to 0, turning the final division into 0 / 0.
  */
 static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
     float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     size_t oc, channels = arg->channels;
     float max_val = -FLT_MAX;
     float sum = 0.f;
     for(oc = 0; oc < channels; oc++)
     {
         max_val = fmaxf(max_val, src[oc]);
     }
     for(oc = 0; oc < channels; oc++)
     {
         float value = expf(src[oc] - max_val);
         sum += value;
         dest[oc] = value;
     }
     for(oc = 0; oc < channels; oc++)
     {
         dest[oc] /= sum;
     }
 }
 /*
  * Concatenates arg->input_count memory ranges into the output buffer.
  *
  * Range i covers inputs_mem[i].size bytes starting at
  * ctx->main_buffer + inputs_mem[i].start; the ranges are copied back to
  * back, in order, beginning at the output address.
  */
 static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     uint8_t *cursor = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
     uint32_t input_count = arg->input_count;
     uint32_t idx;
     for(idx = 0; idx < input_count; idx++)
     {
         kpu_model_memory_range_t range = arg->inputs_mem[idx];
         const uint8_t *chunk = (const uint8_t *)(ctx->main_buffer + range.start);
         memcpy(cursor, chunk, range.size);
         /* The next range is appended right after this one. */
         cursor += range.size;
     }
 }
 /*
  * Fully connected (dense) layer on float data, followed by the layer's
  * activation.
  *
  * arg->weights holds in_channels * out_channels weights (one row of
  * in_channels values per output) followed by out_channels bias values.
  * Both are copied into a heap buffer before use. NOTE(review): presumably
  * the copy exists to obtain suitably aligned float storage - confirm.
  *
  * Each output is bias[oc] plus the dot product of the input with weight
  * row oc. When in_channels is a multiple of 8 the dot product runs through
  * the eight-way FC_UNROLL_* macros, which stay defined after the function.
  *
  * Changes from the previous version:
  *  - The allocation result is checked. If the buffer cannot be allocated
  *    the function returns without writing the output and without applying
  *    the activation, instead of handing a null pointer to memcpy().
  *  - Weights and bias share one allocation, and element counts are computed
  *    in size_t so that in_channels * out_channels cannot wrap in 32 bits.
  */
 static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
     float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
     size_t weight_count = (size_t)in_channels * out_channels;
     size_t param_count = weight_count + out_channels;
     float *params = NULL;
     const float *weights = NULL;
     const float *bias = NULL;
     /* param_count is zero only when there are no outputs to compute. */
     if(param_count != 0)
     {
         params = (float *)malloc(param_count * sizeof(float));
         if(params == NULL)
             return;
         memcpy(params, arg->weights, weight_count * sizeof(float));
         memcpy(params + weight_count, arg->weights + weight_count, out_channels * sizeof(float));
         weights = params;
         bias = params + weight_count;
     }
     if(in_channels % 8 == 0)
     {
 #define FC_UNROLL_1(x)     \
     float i##x = *c_src++; \
     float w##x = *c_weights++;
 #define FC_UNROLL_2(x) \
     sum += i##x * w##x;
         /* Applies stage x to the eight lanes 0..7 of the current group. */
 #define FC_UNROLL_S(x)                       \
     FC_UNROLL_##x(0)                         \
         FC_UNROLL_##x(1)                     \
             FC_UNROLL_##x(2)                 \
                 FC_UNROLL_##x(3)             \
                     FC_UNROLL_##x(4)         \
                         FC_UNROLL_##x(5)     \
                             FC_UNROLL_##x(6) \
                                 FC_UNROLL_##x(7)
         for(oc = 0; oc < out_channels; oc++)
         {
             const float *c_src = src;
             const float *c_weights = weights + (size_t)oc * in_channels;
             float sum = 0.0f;
             for(ic = 0; ic < in_channels / 8; ic++)
             {
                 FC_UNROLL_S(1);
                 FC_UNROLL_S(2);
             }
             dest[oc] = sum + bias[oc];
         }
     }
     else
     {
         for(oc = 0; oc < out_channels; oc++)
         {
             const float *c_weights = weights + (size_t)oc * in_channels;
             float sum = 0.0f;
             for(ic = 0; ic < in_channels; ic++)
                 sum += src[ic] * c_weights[ic];
             dest[oc] = sum + bias[oc];
         }
     }
     free(params);
     kpu_float_activation(dest, out_channels, arg->act);
 }
 /*
  * Flattens a float tensor by reordering it.
  *
  * The input is indexed as (channel, row, column), i.e. element
  * (oc, oy, ox) lives at (oc * height + oy) * width + ox. The output is
  * written sequentially with the row outermost, then the column, and the
  * channel innermost.
  */
 static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const float *in = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
     float *out = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     kpu_model_shape_t in_shape = arg->shape;
     uint32_t oc, oy, ox;
     /* Next free slot in the output buffer. */
     size_t written = 0;
     for(oy = 0; oy < in_shape.height; oy++)
     {
         for(ox = 0; ox < in_shape.width; ox++)
         {
             for(oc = 0; oc < in_shape.channels; oc++)
             {
                 out[written] = in[(oc * in_shape.height + oy) * in_shape.width + ox];
                 written++;
             }
         }
     }
 }
 /*
  * Resizes each channel of a float tensor to out_width x out_height by
  * nearest-neighbour sampling.
  *
  * The source row for output row oy is floorf(oy * in_height / out_height),
  * limited to in_height - 1; columns are chosen the same way from the width
  * ratio. Channel oc of the input starts at src + width * height * oc, and
  * outputs are written sequentially in channel, row, column order.
  */
 static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
     const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
     float *out = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     kpu_model_shape_t in_shape = arg->in_shape;
     uint32_t out_width = arg->out_width, out_height = arg->out_height;
     /* Input step per output pixel along each axis. */
     float height_scale = (float)in_shape.height / out_height;
     float width_scale = (float)in_shape.width / out_width;
     uint32_t oc, oy, ox;
     for(oc = 0; oc < in_shape.channels; oc++)
     {
         const float *channel_src = src + in_shape.width * in_shape.height * oc;
         for(oy = 0; oy < out_height; oy++)
         {
             uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
             const float *row = channel_src + in_y * in_shape.width;
             for(ox = 0; ox < out_width; ox++)
             {
                 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
                 *out++ = row[in_x];
             }
         }
     }
 }
 /*
  * Nearest-neighbour resize of a uint8_t tensor stored in ctx->main_buffer.
  *
  * arg provides the input and output offsets into main_buffer, the input
  * shape and the requested output width/height.  Each input channel is
  * resized on its own and the output is written contiguously through dest.
  *
  * NOTE(review): this listing carries the body twice (a 4-space and an
  * 8-space indented copy, apparently the r453 and r458 text of the same
  * code); the two copies perform the same computation.
  */
 static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
     const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
     uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
     kpu_model_shape_t in_shape = arg->in_shape;
     uint32_t out_width = arg->out_width, out_height = arg->out_height;
     uint32_t oc, oy, ox;
     /* Ratio of input size to output size along each axis. */
     float height_scale = (float)in_shape.height / out_height;
     float width_scale = (float)in_shape.width / out_width;
     for(oc = 0; oc < in_shape.channels; oc++)
+    {
         const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
         for(oy = 0; oy < out_height; oy++)
+        {
             /* Source row for this output row, clamped to the last input row. */
             uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
             const uint8_t *y_origin = channel_src + in_y * in_shape.width;
             for(ox = 0; ox < out_width; ox++)
+            {
                 /* Source column, clamped to the last input column. */
                 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
                 *dest++ = y_origin[in_x];
+            }
+        }
+    }
+        const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
+        uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
+        kpu_model_shape_t in_shape = arg->in_shape;
+        uint32_t out_width = arg->out_width, out_height = arg->out_height;
+        uint32_t oc, oy, ox;
+        float height_scale = (float)in_shape.height / out_height;
+        float width_scale = (float)in_shape.width / out_width;
+        for (oc = 0; oc < in_shape.channels; oc++)
+        {
+                const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
+                for (oy = 0; oy < out_height; oy++)
+                {
+                        uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
+                        const uint8_t *y_origin = channel_src + in_y * in_shape.width;
+                        for (ox = 0; ox < out_width; ox++)
+                        {
+                                uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
+                                *dest++ = y_origin[in_x];
+                        }
+                }
+        }
+}
 /*
  * Element-wise logistic function: dest[i] = 1 / (1 + expf(-src[i])) for
  * arg->channels float values.  src and dest are located in
  * ctx->main_buffer at the offsets given by arg.
  *
  * NOTE(review): the body appears twice in this listing (4-space and
  * 8-space indented copies of the same statements).
  */
 static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
     const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
     float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     size_t oc, channels = arg->channels;
     for(oc = 0; oc < channels; oc++)
         dest[oc] = 1.f / (1.f + expf(-src[oc]));
+        const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
+        float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
+        size_t oc, channels = arg->channels;
+        for (oc = 0; oc < channels; oc++)
+                dest[oc] = 1.f / (1.f + expf(-src[oc]));
+}
 /*
  * Submit one convolution layer to the KPU.
  *
  * A local copy of the layer descriptor stored at
  * ctx->model_buffer + arg->layer_offset is made, its weight,
  * batch-norm and activation address fields are rewritten as
  * (model_buffer + offset) - IOMEM, and the copy is handed to
  * kpu_send_layer().
  *
  * When arg->flags has KLF_MAIN_MEM_OUT set, a DMA transfer from
  * kpu->fifo_data_out into ctx->main_buffer is prepared first, with a
  * completion callback of ai_step (more layers remain) or kpu_kmodel_done.
  * Otherwise only the KPU interrupt registers and the descriptor's int_en
  * bit are set up.
  *
  * NOTE(review): the body appears twice in this listing (4-space and
  * 8-space indented copies); apart from brace placement the statements
  * are the same.
  */
 static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
+    volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
+    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
+    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
+    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;
+    if(arg->flags & KLF_MAIN_MEM_OUT)
+    {
+        dmac_channel_number_t dma_ch = ctx->dma_ch;
+        uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
+        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
+            .calc_done_int = 1,
+            .layer_cfg_almost_empty_int = 1,
+            .layer_cfg_almost_full_int = 1};
+        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+            .calc_done_int = 1,
+            .layer_cfg_almost_empty_int = 1,
+            .layer_cfg_almost_full_int = 1};
+        layer.dma_parameter.data.send_data_out = 1;
+        select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
         /* Pick the DMA completion callback: continue with the next layer, or finish. */
+        if(ctx->current_layer < ctx->layers_length)
+            dmac_set_irq(dma_ch, ai_step, ctx, 1);
+        else
+            dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
         /* Transfer count is (dma_total_byte + 8) / 8 items of DMAC_TRANS_WIDTH_64. */
+        dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
+                            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
+    } else
+    {
+        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
+            .calc_done_int = 1,
+            .layer_cfg_almost_empty_int = 1,
+            .layer_cfg_almost_full_int = 1};
         /* NOTE(review): calc_done_int is 0 only in this branch - presumably this
            leaves the calc-done interrupt unmasked; confirm against the KPU register spec. */
+        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+            .calc_done_int = 0,
+            .layer_cfg_almost_empty_int = 1,
+            .layer_cfg_almost_full_int = 1};
+        layer.interrupt_enabe.data.int_en = 1;
+    }
+    kpu_send_layer((const kpu_layer_argument_t *)&layer);
+        volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
+        layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
+        layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
+        layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;
+        if (arg->flags & KLF_MAIN_MEM_OUT)
+        {
+                dmac_channel_number_t dma_ch = ctx->dma_ch;
+                uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
+                kpu->interrupt_clear.data = (kpu_config_interrupt_t){
+                        .calc_done_int = 1,
+                        .layer_cfg_almost_empty_int = 1,
+                        .layer_cfg_almost_full_int = 1};
+                kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+                        .calc_done_int = 1,
+                        .layer_cfg_almost_empty_int = 1,
+                        .layer_cfg_almost_full_int = 1};
+                layer.dma_parameter.data.send_data_out = 1;
+                select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
+                if (ctx->current_layer < ctx->layers_length)
+                        dmac_set_irq(dma_ch, ai_step, ctx, 1);
+                else
+                        dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
+                dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
+                                                         DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
+        }
+        else
+        {
+                kpu->interrupt_clear.data = (kpu_config_interrupt_t){
+                        .calc_done_int = 1,
+                        .layer_cfg_almost_empty_int = 1,
+                        .layer_cfg_almost_full_int = 1};
+                kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+                        .calc_done_int = 0,
+                        .layer_cfg_almost_empty_int = 1,
+                        .layer_cfg_almost_full_int = 1};
+                layer.interrupt_enabe.data.int_en = 1;
+        }
+        kpu_send_layer((const kpu_layer_argument_t *)&layer);
+}
 /*
  * Scatter arg->channels bytes from ctx->main_buffer into the KPU memory
  * window, one byte per channel.
  *
  * The destination base is AI_RAM_BASE_ADDR when USE_CACHED_AI_RAM is set
  * and AI_IO_BASE_ADDR otherwise, offset by arg->kpu_mem_out_address * 64.
  * Channel oc is written at
  *   dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding
  * with row_group = 4, row_length = 1, height = 4 and row_padding = 16.
  * In the cached configuration the written lines are flushed with
  * kpu_flush_cache() afterwards.
  *
  * NOTE(review): the body appears twice in this listing (4-space and
  * 8-space indented copies of the same statements).
  */
 static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
     const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
+        const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
 #if USE_CACHED_AI_RAM
     uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
+        uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
 #else
     uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
+        uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
 #endif
     uint32_t row_padding = 16;
     uint32_t row_group = 4;
     uint32_t row_length = 1;
     uint32_t height = 4;
     uint32_t oc, x, y, channels = arg->channels;
     for(oc = 0; oc < channels; oc++)
+    {
         uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
         /* Both inner loops have an upper bound of 1, so exactly one byte is copied per channel. */
         for(y = 0; y < 1; y++)
+        {
             uint8_t *y_origin = channel_origin + y * row_length * 64;
             for(x = 0; x < 1; x++)
                 y_origin[x] = *src++;
+        }
+    }
+        uint32_t row_padding = 16;
+        uint32_t row_group = 4;
+        uint32_t row_length = 1;
+        uint32_t height = 4;
+        uint32_t oc, x, y, channels = arg->channels;
+        for (oc = 0; oc < channels; oc++)
+        {
+                uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
+                for (y = 0; y < 1; y++)
+                {
+                        uint8_t *y_origin = channel_origin + y * row_length * 64;
+                        for (x = 0; x < 1; x++)
+                                y_origin[x] = *src++;
+                }
+        }
 #if USE_CACHED_AI_RAM
     uint32_t lines = row_length * height * channels / row_group;
     kpu_flush_cache(arg->kpu_mem_out_address, lines);
+        uint32_t lines = row_length * height * channels / row_group;
+        kpu_flush_cache(arg->kpu_mem_out_address, lines);
 #endif
+}
 …
 /*
  * Gather one byte per channel from a padded buffer: for each of
  * arg->channels channels, src[oc * 16] is copied to consecutive bytes of
  * dest.  Both buffers live in ctx->main_buffer at the offsets given by arg.
  *
  * NOTE(review): the body appears twice in this listing (4-space and
  * 8-space indented copies of the same statements).
  */
 static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
     const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
     uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
     uint32_t oc, channels = arg->channels;
     for(oc = 0; oc < channels; oc++)
         *dest++ = src[oc * 16];
+        const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
+        uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
+        uint32_t oc, channels = arg->channels;
+        for (oc = 0; oc < channels; oc++)
+                *dest++ = src[oc * 16];
+}
 /*
  * Forward an upload layer to kpu_upload_core(): passes the layer's width,
  * height and channel count, the source pointer
  * ctx->main_buffer + arg->main_mem_in_address, and the destination
  * arg->kpu_mem_out_address.
  *
  * NOTE(review): the body appears twice in this listing (4-space and
  * 8-space indented copies of the same statements).
  */
 static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
+{
     size_t width = arg->width;
     size_t height = arg->height;
     size_t channels = arg->channels;
     kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
+        size_t width = arg->width;
+        size_t height = arg->height;
+        size_t channels = arg->channels;
+        kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
+}
 …
+{
 #if FIX_CACHE
     configASSERT(is_memory_cache((uintptr_t)buffer));
+        configASSERT(is_memory_cache((uintptr_t)buffer));
 #endif
+    uintptr_t base_addr = (uintptr_t)buffer;
+    const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
+    if (header->version == 3 && header->arch == 0)
+    {
+        ctx->is_nncase = 0;
+        ctx->model_buffer = buffer;
+        ctx->output_count = header->output_count;
+        ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
+        ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
+        ctx->layers_length = header->layers_length;
+        ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
+        ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
+        if (!ctx->main_buffer)
+            return -1;
+        uint32_t body_size = 0;
+        for (int i=0; i<ctx->layers_length; i++)
+        {
+            const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
+            body_size += cnt_layer_header->body_size;
+        }
+        uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
+        const uint8_t *body_start_cache = ctx->body_start;
+        memcpy(body_start_iomem, body_start_cache, body_size);
+        for (int i=0; i<body_size; i++)
+        {
+            configASSERT(body_start_iomem[i] == body_start_cache[i]);
+        }
+    } else
+    {
+        return -1;
+    }
+    return 0;
+        uintptr_t base_addr = (uintptr_t)buffer;
+        const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
+        if (header->version == 3 && header->arch == 0)
+        {
+                ctx->model_buffer = buffer;
+                ctx->output_count = header->output_count;
+                ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
+                ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
+                ctx->layers_length = header->layers_length;
+                ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
+                ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
+                if (!ctx->main_buffer)
+                        return -1;
+                uint32_t body_size = 0;
+                for (int i = 0; i < ctx->layers_length; i++)
+                {
+                        const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
+                        body_size += cnt_layer_header->body_size;
+                }
+                uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
+                const uint8_t *body_start_cache = ctx->body_start;
+                memcpy(body_start_iomem, body_start_cache, body_size);
+                for (int i = 0; i < body_size; i++)
+                {
+                        configASSERT(body_start_iomem[i] == body_start_cache[i]);
+                }
+        }
+        else
+        {
+                return -1;
+        }
+        return 0;
+}
 /*
  * Look up model output number `index`.
  *
  * On success stores ctx->main_buffer + output->address in *data and
  * output->size in *size, and returns 0.  Returns -1 when index is not
  * below ctx->output_count.
  *
  * NOTE(review): the two copies of the body in this listing differ - the
  * 4-space indented copy additionally returns -1 when ctx->is_nncase is
  * set, the 8-space indented copy has no such check.
  */
 int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
+{
+    if(ctx->is_nncase)
+        return -1;
+    if(index >= ctx->output_count)
+        return -1;
+    const kpu_model_output_t *output = ctx->outputs + index;
+    *data = ctx->main_buffer + output->address;
+    *size = output->size;
+    return 0;
+        if (index >= ctx->output_count)
+                return -1;
+        const kpu_model_output_t *output = ctx->outputs + index;
+        *data = ctx->main_buffer + output->address;
+        *size = output->size;
+        return 0;
+}
 /*
  * Release ctx->main_buffer with free() and reset the pointer to NULL.
  *
  * NOTE(review): the two copies of the body in this listing differ - the
  * 4-space indented copy returns early without freeing when
  * ctx->is_nncase is set, the 8-space indented copy frees unconditionally.
  */
 void kpu_model_free(kpu_model_context_t *ctx)
+{
+    if(ctx->is_nncase)
+        return;
+    free(ctx->main_buffer);
+    ctx->main_buffer = NULL;
+        free(ctx->main_buffer);
+        ctx->main_buffer = NULL;
+}
 …
 /*
  * Return a short display name for a KL_* layer type constant, or
  * "Unknown" for any value without a case below.  The returned pointer
  * refers to a string literal.
  *
  * NOTE(review): the switch appears twice in this listing (4-space and
  * 8-space indented copies with the same cases and strings).
  */
 static const char *str_layer_type(uint32_t type)
+{
     switch(type)
+    {
         case KL_ADD:
             return "Add";
         case KL_QUANTIZED_ADD:
             return "QuantAdd";
         case KL_GLOBAL_AVERAGE_POOL2D:
             return "GAP";
         case KL_QUANTIZED_MAX_POOL2D:
             return "QuantMaxPool2d";
         case KL_AVERAGE_POOL2D:
             return "AveragePool2d";
         case KL_QUANTIZE:
             return "Quantize";
         case KL_DEQUANTIZE:
             return "Dequantize";
         case KL_REQUANTIZE:
             return "Requantize";
         case KL_L2_NORMALIZATION:
             return "L2Norm";
         case KL_SOFTMAX:
             return "Softmax";
         case KL_CONCAT:
             return "Concat";
         case KL_QUANTIZED_CONCAT:
             return "QuantConcat";
         case KL_FULLY_CONNECTED:
             return "FullyConnected";
         case KL_TENSORFLOW_FLATTEN:
             return "TFFlatten";
         case KL_RESIZE_NEAREST_NEIGHBOR:
             return "ResizeNearestNeighbor";
         case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
             return "QuantResizeNearestNeighbor";
         case KL_CHANNELWISE_DEQUANTIZE:
             return "ChannelwiseDequantize";
         case KL_LOGISTIC:
             return "Logistic";
         case KL_K210_CONV:
             return "K210Conv";
         case KL_K210_ADD_PADDING:
             return "K210AddPad";
         case KL_K210_REMOVE_PADDING:
             return "K210RemovePad";
         case KL_K210_UPLOAD:
             return "K210Upload";
         default:
             return "Unknown";
+    }
+        switch (type)
+        {
+        case KL_ADD:
+                return "Add";
+        case KL_QUANTIZED_ADD:
+                return "QuantAdd";
+        case KL_GLOBAL_AVERAGE_POOL2D:
+                return "GAP";
+        case KL_QUANTIZED_MAX_POOL2D:
+                return "QuantMaxPool2d";
+        case KL_AVERAGE_POOL2D:
+                return "AveragePool2d";
+        case KL_QUANTIZE:
+                return "Quantize";
+        case KL_DEQUANTIZE:
+                return "Dequantize";
+        case KL_REQUANTIZE:
+                return "Requantize";
+        case KL_L2_NORMALIZATION:
+                return "L2Norm";
+        case KL_SOFTMAX:
+                return "Softmax";
+        case KL_CONCAT:
+                return "Concat";
+        case KL_QUANTIZED_CONCAT:
+                return "QuantConcat";
+        case KL_FULLY_CONNECTED:
+                return "FullyConnected";
+        case KL_TENSORFLOW_FLATTEN:
+                return "TFFlatten";
+        case KL_RESIZE_NEAREST_NEIGHBOR:
+                return "ResizeNearestNeighbor";
+        case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
+                return "QuantResizeNearestNeighbor";
+        case KL_CHANNELWISE_DEQUANTIZE:
+                return "ChannelwiseDequantize";
+        case KL_LOGISTIC:
+                return "Logistic";
+        case KL_K210_CONV:
+                return "K210Conv";
+        case KL_K210_ADD_PADDING:
+                return "K210AddPad";
+        case KL_K210_REMOVE_PADDING:
+                return "K210RemovePad";
+        case KL_K210_UPLOAD:
+                return "K210Upload";
+        default:
+                return "Unknown";
+        }
+}
 #endif
 …
 /*
  * Finish a model run: writes 1 to all three fields of the KPU
  * interrupt_clear and interrupt_mask registers, optionally logs timing
  * totals (KPU_DEBUG builds), then invokes ctx->done_callback with
  * ctx->userdata.  Always returns 0.
  *
  * NOTE(review): the two copies of the body in this listing differ inside
  * the KPU_DEBUG section: the 4-space copy reports layer id
  * current_layer - 1 and formats times with %f, while the 8-space copy
  * reports current_layer and formats times as "%d.%03d".
  * NOTE(review): in the 8-space copy the values passed for %d are derived
  * from uint64_t variables - confirm that this matches what the target's
  * syslog() expects for %d.
  */
 static int kpu_kmodel_done(kpu_model_context_t *ctx)
+{
     kpu->interrupt_clear.data = (kpu_config_interrupt_t){
         .calc_done_int = 1,
         .layer_cfg_almost_empty_int = 1,
         .layer_cfg_almost_full_int = 1};
     kpu->interrupt_mask.data = (kpu_config_interrupt_t){
         .calc_done_int = 1,
         .layer_cfg_almost_empty_int = 1,
         .layer_cfg_almost_full_int = 1};
+        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
+                .calc_done_int = 1,
+                .layer_cfg_almost_empty_int = 1,
+                .layer_cfg_almost_full_int = 1};
+        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+                .calc_done_int = 1,
+                .layer_cfg_almost_empty_int = 1,
+                .layer_cfg_almost_full_int = 1};
 #if KPU_DEBUG
     uint32_t cnt_layer_id = ctx->current_layer - 1;
     uint64_t time = sysctl_get_time_us();
     /* Account for the layer that was running when the model finished. */
     if(last_time != 0)
+    {
         uint64_t layer_time = time - last_time;
         syslog(LOG_NOTICE, "layer %d [%s]: %f ms", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);
         total_time += layer_time;
         if(last_layer_type == KL_K210_CONV)
             kpu_time += layer_time;
+    }
     syslog(LOG_NOTICE, "KPU: %f ms", kpu_time / 1000.0);
     syslog(LOG_NOTICE, "CPU: %f ms", (total_time - kpu_time) / 1000.0);
     syslog(LOG_NOTICE, "Model: %f ms", total_time / 1000.0);
+        uint32_t cnt_layer_id = ctx->current_layer;
+        uint64_t time = sysctl_get_time_us();
+        if (last_time != 0)
+        {
+                uint64_t layer_time = time - last_time;
+                syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
+                total_time += layer_time;
+                if (last_layer_type == KL_K210_CONV)
+                        kpu_time += layer_time;
+        }
+        syslog(LOG_NOTICE, "KPU: %d.%03d ms", kpu_time / 1000, kpu_time % 1000);
+        syslog(LOG_NOTICE, "CPU: %d.%03d ms", (total_time - kpu_time) / 1000, (total_time - kpu_time) % 1000);
+        syslog(LOG_NOTICE, "Model: %d.%03d ms", total_time / 1000, total_time % 1000);
 #endif
     ctx->done_callback(ctx->userdata);
     return 0;
+        ctx->done_callback(ctx->userdata);
+        return 0;
+}
 /*
  * Execute the next layer of the model described by the context passed in
  * userdata (a kpu_model_context_t *).
  *
  * The layer header at index ctx->current_layer selects the handler; the
  * handler receives ctx->current_body reinterpreted as that layer's
  * argument struct.  current_layer and current_body are advanced before
  * the handler runs.
  *
  * Returns -1 (after calling kpu_kmodel_done) when current_layer is
  * already >= layers_length or when the layer type has no case in the
  * switch.  For KL_K210_CONV the function returns 0 right after kpu_conv()
  * without continuing itself.  For every other layer it either recurses
  * into ai_step for the following layer or calls kpu_kmodel_done, and
  * returns 0.
  *
  * NOTE(review): the two copies of the body in this listing differ in the
  * final continuation test: the 4-space copy continues while
  * current_layer < layers_length, the 8-space copy while
  * current_layer < (layers_length - 1).  Confirm which one is intended;
  * with the latter, kpu_kmodel_done is reached one layer earlier on this
  * path.
  */
 static int ai_step(void *userdata)
+{
+    kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;
+    uint32_t cnt_layer_id = ctx->current_layer;
+    const uint8_t *layer_body = ctx->current_body;
     /* Only the address is formed here; the header is dereferenced after the bounds check. */
+    const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
+    if (cnt_layer_id >= ctx->layers_length) {
+        //syslog(LOG_NOTICE, "overrun");
+        kpu_kmodel_done(ctx);
+        return -1;
+    }
+    ctx->current_layer++;
+    ctx->current_body += cnt_layer_header->body_size;
+        kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;
+        uint32_t cnt_layer_id = ctx->current_layer;
+        const uint8_t *layer_body = ctx->current_body;
+        const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
+        if (cnt_layer_id >= ctx->layers_length)
+        {
+                //syslog(LOG_NOTICE, "overrun");
+                kpu_kmodel_done(ctx);
+                return -1;
+        }
+        ctx->current_layer++;
+        ctx->current_body += cnt_layer_header->body_size;
 #if KPU_DEBUG
     /* Log the time spent in the previous layer, then remember this layer's type and start time. */
     uint64_t time = sysctl_get_time_us();
     if(last_time != 0)
+    {
         uint64_t layer_time = time - last_time;
         syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
         total_time += layer_time;
         if(last_layer_type == KL_K210_CONV)
             kpu_time += layer_time;
+    }
     last_layer_type = cnt_layer_header->type;
     last_time = sysctl_get_time_us();
+        uint64_t time = sysctl_get_time_us();
+        if (last_time != 0)
+        {
+                uint64_t layer_time = time - last_time;
+                syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
+                total_time += layer_time;
+                if (last_layer_type == KL_K210_CONV)
+                        kpu_time += layer_time;
+        }
+        last_layer_type = cnt_layer_header->type;
+        last_time = sysctl_get_time_us();
 #endif
     switch(cnt_layer_header->type)
+    {
         case KL_ADD:
             kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
             break;
         case KL_QUANTIZED_ADD:
             kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
             break;
         case KL_GLOBAL_AVERAGE_POOL2D:
             kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
             break;
         case KL_QUANTIZED_MAX_POOL2D:
             kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
             break;
         case KL_AVERAGE_POOL2D:
             kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
             break;
         case KL_QUANTIZE:
             kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
             break;
         case KL_DEQUANTIZE:
             kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
             break;
         case KL_REQUANTIZE:
             kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
             break;
         case KL_L2_NORMALIZATION:
             kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
             break;
         case KL_SOFTMAX:
             kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
             break;
         /* Both concat layer types share one handler. */
         case KL_CONCAT:
         case KL_QUANTIZED_CONCAT:
             kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
             break;
         case KL_FULLY_CONNECTED:
             kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
             break;
         case KL_TENSORFLOW_FLATTEN:
             kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
             break;
         case KL_RESIZE_NEAREST_NEIGHBOR:
             kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
             break;
         case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
             kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
             break;
         case KL_CHANNELWISE_DEQUANTIZE:
             kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
             break;
         case KL_LOGISTIC:
             kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
             break;
         /* Returns instead of breaking: the next step is not started from here.
            NOTE(review): presumably the KPU/DMA interrupt set up by kpu_conv re-enters ai_step - confirm. */
         case KL_K210_CONV:
             kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
             return 0;
         case KL_K210_ADD_PADDING:
             kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
             break;
         case KL_K210_REMOVE_PADDING:
             kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
             break;
         case KL_K210_UPLOAD:
             kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
             break;
         default:
             assert(!"Layer is not supported.");
             kpu_kmodel_done(ctx);
             return -1;
+    }
     /* The layer ran to completion in this call: chain to the next one or finish. */
     if (ctx->current_layer < ctx->layers_length)
         ai_step(userdata);
     else
         kpu_kmodel_done(ctx);
     return 0;
+        switch (cnt_layer_header->type)
+        {
+        case KL_ADD:
+                kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_QUANTIZED_ADD:
+                kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_GLOBAL_AVERAGE_POOL2D:
+                kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_QUANTIZED_MAX_POOL2D:
+                kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_AVERAGE_POOL2D:
+                kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_QUANTIZE:
+                kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_DEQUANTIZE:
+                kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_REQUANTIZE:
+                kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_L2_NORMALIZATION:
+                kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_SOFTMAX:
+                kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_CONCAT:
+        case KL_QUANTIZED_CONCAT:
+                kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_FULLY_CONNECTED:
+                kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_TENSORFLOW_FLATTEN:
+                kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_RESIZE_NEAREST_NEIGHBOR:
+                kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
+                kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_CHANNELWISE_DEQUANTIZE:
+                kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
+                break;
+        case KL_LOGISTIC:
+                kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_K210_CONV:
+                kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
+                return 0;
+        case KL_K210_ADD_PADDING:
+                kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_K210_REMOVE_PADDING:
+                kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
+                break;
+        case KL_K210_UPLOAD:
+                kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
+                break;
+        default:
+                assert(!"Layer is not supported.");
+                kpu_kmodel_done(ctx);
+                return -1;
+        }
+        if (ctx->current_layer < (ctx->layers_length - 1))
+                ai_step(userdata);
+        else
+                kpu_kmodel_done(ctx);
+        return 0;
+}
 /*
  * Run ai_step(userdata) with interrupts held off around the call.
  *
  * NOTE(review): the two copies of the body in this listing differ in how
  * that is done - the 4-space copy brackets the call with
  * sysctl_disable_irq()/sysctl_enable_irq(), the 8-space copy with
  * dis_int()/ena_int() on INTNO_DMAAI and INTNO_AI only.
  */
 static void ai_step_not_isr(void *userdata)
+{
+    sysctl_disable_irq();
+    ai_step(userdata);
+    sysctl_enable_irq();
+        dis_int(INTNO_DMAAI);
+        dis_int(INTNO_AI);
+        ai_step(userdata);
+        ena_int(INTNO_DMAAI);
+        ena_int(INTNO_AI);
+}
 /*
  * Starts execution of a loaded kmodel.
  *
  * Stores dma_ch, done_callback and userdata in ctx, resets current_layer to 0
  * and current_body to body_start, programs the KPU registers (interrupt
  * clear, FIFO thresholds, eight-bit mode taken from bit 0 of the kmodel
  * header flags, interrupt mask), registers ai_step as the INTNO_AI handler
  * with ctx as its argument, and then feeds the input for the first layer.
  *
  * Returns 0 once the first layer has been started, or -1 when the first
  * layer type is neither KL_K210_CONV nor KL_FULLY_CONNECTED.
  *
  * NOTE(review): this listing interleaves two variants of the body (4-space
  * and 8-space indentation, presumably r453 and r458). Only the 4-space
  * variant rejects ctx->is_nncase with -1 and calls plic_set_priority(); the
  * 8-space variant has that call commented out.
  */
 int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
+{
+    if(ctx->is_nncase)
+        return -1;
+    ctx->dma_ch = dma_ch;
+    ctx->done_callback = done_callback;
+    ctx->userdata = userdata;
+    ctx->current_layer = 0;
+    ctx->current_body = ctx->body_start;
+        ctx->dma_ch = dma_ch;
+        ctx->done_callback = done_callback;
+        ctx->userdata = userdata;
+        ctx->current_layer = 0;
+        ctx->current_body = ctx->body_start;
 #if KPU_DEBUG
     last_time = 0;
     total_time = 0;
     kpu_time = 0;
+        last_time = 0;
+        total_time = 0;
+        kpu_time = 0;
 #endif
+    kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
+    kpu->interrupt_clear.reg = 7;
+    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
+        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
+    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
+        .eight_bit_mode = header->flags & 1};
+    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+        .calc_done_int = 1,
+        .layer_cfg_almost_empty_int = 0,
+        .layer_cfg_almost_full_int = 1};
+    plic_set_priority(INTNO_AI, 1);
+    plic_irq_register(INTNO_AI, ai_step, ctx);
+    plic_irq_enable(INTNO_AI);
+    const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;
+    switch(first_layer_header->type)
+    {
+        case KL_K210_CONV:
+        {
+            const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
+            kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);
             /* When (i_row_wid + 1) is not a multiple of 64 the input goes through kpu_kmodel_input_with_padding() and ai_step is driven directly; otherwise kpu_input_dma() is given ai_step as its callback. */
+            if((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
+            {
+                kpu_kmodel_input_with_padding(&layer_arg, src);
+                ai_step_not_isr(ctx);
+            } else
+            {
+                kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
+            }
+        }
+        break;
+        case KL_FULLY_CONNECTED:
+        {
+            const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
+            kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
+            ai_step_not_isr(ctx);
+        }
+        break;
+        default:
+            return -1;
+    }
+    return 0;
+}
 /* NOTE(review): from here on the same statements repeat with 8-space indentation (second variant of the body). */
+        kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
+        kpu->interrupt_clear.reg = 7;
+        kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
+                .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
+        kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
+                .eight_bit_mode = header->flags & 1};
+        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
+                .calc_done_int = 1,
+                .layer_cfg_almost_empty_int = 0,
+                .layer_cfg_almost_full_int = 1};
+        //plic_set_priority(INTNO_AI, 1);
+        plic_irq_register(INTNO_AI, ai_step, ctx);
+        plic_irq_enable(INTNO_AI);
+        const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;
+        switch (first_layer_header->type)
+        {
+        case KL_K210_CONV:
+        {
+                const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
+                kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);
+                if ((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
+                {
+                        kpu_kmodel_input_with_padding(&layer_arg, src);
+                        ai_step_not_isr(ctx);
+                }
+                else
+                {
+                        kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
+                }
+        }
+        break;
+        case KL_FULLY_CONNECTED:
+        {
+                const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
+                kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
+                ai_step_not_isr(ctx);
+        }
+        break;
+        default:
+                return -1;
+        }
+        return 0;
+}
/*
 * Fills in the global DMA handle g_ai_hdma for channel AI_DMA_CH and passes
 * it to dma_init().
 *
 * The handle is set up for a peripheral-to-memory transfer requested by
 * DMA_SELECT_AI_RX_REQ, with ai_dma_done_isr as the transfer callback, no
 * error callback, and ctx stored in localdata.
 *
 * Returns whatever dma_init() returns.
 */
+ER kpu_init(kpu_model_context_t *ctx)
+{
+        g_ai_hdma.chnum = AI_DMA_CH;
+        g_ai_hdma.xfercallback = ai_dma_done_isr;
+        g_ai_hdma.errorcallback = NULL;
+        g_ai_hdma.Init.Request = DMA_SELECT_AI_RX_REQ;          /* DMA request selection */
+        g_ai_hdma.Init.Direction = DMA_PERIPH_TO_MEMORY;        /* DMA transfer direction */
+        g_ai_hdma.Init.SrcMultBlock = DMAC_MULTBLOCK_CONT;      /* source multi-block type */
+        g_ai_hdma.Init.DrcMultBlock = DMAC_MULTBLOCK_CONT;      /* destination multi-block type */
+        g_ai_hdma.Init.SrcHandShake = DMAC_HS_HARDWARE;         /* source handshake */
+        g_ai_hdma.Init.DrcHandShake = DMAC_HS_SOFTWARE;         /* destination handshake */
+        g_ai_hdma.Init.SrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* source hardware handshake polarity */
+        g_ai_hdma.Init.DrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* destination hardware handshake polarity */
+        g_ai_hdma.Init.Priority = 4;                                            /* priority */
+        g_ai_hdma.Init.SrcMaster = DMAC_MASTER1;                        /* source master setting */
+        g_ai_hdma.Init.DstMaster = DMAC_MASTER2;                        /* destination master setting */
+        g_ai_hdma.Init.SrcInc = DMAC_ADDR_NOCHANGE;                     /* source address increment setting */
+        g_ai_hdma.Init.DstInc = DMAC_ADDR_INCREMENT;            /* destination address increment setting */
+        g_ai_hdma.Init.SrcTransWidth = DMAC_TRANS_WIDTH_32; /* source transfer width */
+        g_ai_hdma.Init.DstTransWidth = DMAC_TRANS_WIDTH_32; /* destination transfer width */
+        g_ai_hdma.Init.SrcBurstSize = DMAC_MSIZE_4;                     /* source burst size */
+        g_ai_hdma.Init.DstBurstSize = DMAC_MSIZE_4;                     /* destination burst size */
+        g_ai_hdma.Init.IocBlkTrans = 0;                                         /* IOC block transfer */
+        g_ai_hdma.localdata = (void *)ctx;
+        return dma_init(&g_ai_hdma);
+}

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.h

-              r453
+              r458
 #endif
-#define kpu_matmul_begin kpu_conv2d_output
 #define IOMEM 0x40000000
/* NOTE(review): dmac_channel_number_t is aliased to int through a macro, not a typedef; the identical #define also appears after kpu_config_t further down this listing -- presumably the line was moved between revisions, confirm. */
+#define dmac_channel_number_t int
 typedef int (*plic_irq_callback_t)(void *ctx);
 …
 typedef struct
+{
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t int_en : 1;
             uint64_t ram_flag : 1;
             uint64_t full_add : 1;
             uint64_t depth_wise_layer : 1;
             uint64_t reserved : 60;
         } data;
     } interrupt_enabe;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t image_src_addr : 15;
             uint64_t reserved0 : 17;
             uint64_t image_dst_addr : 15;
             uint64_t reserved1 : 17;
         } data;
     } image_addr;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t i_ch_num : 10;
             uint64_t reserved0 : 22;
             uint64_t o_ch_num : 10;
             uint64_t reserved1 : 6;
             uint64_t o_ch_num_coef : 10;
             uint64_t reserved2 : 6;
         } data;
     } image_channel_num;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t i_row_wid : 10;
             uint64_t i_col_high : 9;
             uint64_t reserved0 : 13;
             uint64_t o_row_wid : 10;
             uint64_t o_col_high : 9;
             uint64_t reserved1 : 13;
         } data;
     } image_size;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t kernel_type : 3;
             uint64_t pad_type : 1;
             uint64_t pool_type : 4;
             uint64_t first_stride : 1;
             uint64_t bypass_conv : 1;
             uint64_t load_para : 1;
             uint64_t reserved0 : 5;
             uint64_t dma_burst_size : 8;
             uint64_t pad_value : 8;
             uint64_t bwsx_base_addr : 32;
         } data;
     } kernel_pool_type_cfg;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t load_coor : 1;
             uint64_t load_time : 6;
             uint64_t reserved0 : 8;
             uint64_t para_size : 17;
             uint64_t para_start_addr : 32;
         } data;
     } kernel_load_cfg;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t coef_column_offset : 4;
             uint64_t coef_row_offset : 12;
             uint64_t reserved0 : 48;
         } data;
     } kernel_offset;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t channel_switch_addr : 15;
             uint64_t reserved : 1;
             uint64_t row_switch_addr : 4;
             uint64_t coef_size : 8;
             uint64_t coef_group : 3;
             uint64_t load_act : 1;
             uint64_t active_addr : 32;
         } data;
     } kernel_calc_type_cfg;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t wb_channel_switch_addr : 15;
             uint64_t reserved0 : 1;
             uint64_t wb_row_switch_addr : 4;
             uint64_t wb_group : 3;
             uint64_t reserved1 : 41;
         } data;
     } write_back_cfg;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t shr_w : 4;
             uint64_t shr_x : 4;
             uint64_t arg_w : 24;
             uint64_t arg_x : 24;
             uint64_t reserved0 : 8;
         } data;
     } conv_value;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t arg_add : 40;
             uint64_t reserved : 24;
         } data;
     } conv_value2;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t send_data_out : 1;
             uint64_t reserved : 15;
             uint64_t channel_byte_num : 16;
             uint64_t dma_total_byte : 32;
         } data;
     } dma_parameter;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t int_en : 1;
+                        uint64_t ram_flag : 1;
+                        uint64_t full_add : 1;
+                        uint64_t depth_wise_layer : 1;
+                        uint64_t reserved : 60;
+                } data;
+        } interrupt_enabe;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t image_src_addr : 15;
+                        uint64_t reserved0 : 17;
+                        uint64_t image_dst_addr : 15;
+                        uint64_t reserved1 : 17;
+                } data;
+        } image_addr;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t i_ch_num : 10;
+                        uint64_t reserved0 : 22;
+                        uint64_t o_ch_num : 10;
+                        uint64_t reserved1 : 6;
+                        uint64_t o_ch_num_coef : 10;
+                        uint64_t reserved2 : 6;
+                } data;
+        } image_channel_num;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t i_row_wid : 10;
+                        uint64_t i_col_high : 9;
+                        uint64_t reserved0 : 13;
+                        uint64_t o_row_wid : 10;
+                        uint64_t o_col_high : 9;
+                        uint64_t reserved1 : 13;
+                } data;
+        } image_size;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t kernel_type : 3;
+                        uint64_t pad_type : 1;
+                        uint64_t pool_type : 4;
+                        uint64_t first_stride : 1;
+                        uint64_t bypass_conv : 1;
+                        uint64_t load_para : 1;
+                        uint64_t reserved0 : 5;
+                        uint64_t dma_burst_size : 8;
+                        uint64_t pad_value : 8;
+                        uint64_t bwsx_base_addr : 32;
+                } data;
+        } kernel_pool_type_cfg;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t load_coor : 1;
+                        uint64_t load_time : 6;
+                        uint64_t reserved0 : 8;
+                        uint64_t para_size : 17;
+                        uint64_t para_start_addr : 32;
+                } data;
+        } kernel_load_cfg;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t coef_column_offset : 4;
+                        uint64_t coef_row_offset : 12;
+                        uint64_t reserved0 : 48;
+                } data;
+        } kernel_offset;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t channel_switch_addr : 15;
+                        uint64_t reserved : 1;
+                        uint64_t row_switch_addr : 4;
+                        uint64_t coef_size : 8;
+                        uint64_t coef_group : 3;
+                        uint64_t load_act : 1;
+                        uint64_t active_addr : 32;
+                } data;
+        } kernel_calc_type_cfg;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t wb_channel_switch_addr : 15;
+                        uint64_t reserved0 : 1;
+                        uint64_t wb_row_switch_addr : 4;
+                        uint64_t wb_group : 3;
+                        uint64_t reserved1 : 41;
+                } data;
+        } write_back_cfg;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t shr_w : 4;
+                        uint64_t shr_x : 4;
+                        uint64_t arg_w : 24;
+                        uint64_t arg_x : 24;
+                        uint64_t reserved0 : 8;
+                } data;
+        } conv_value;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t arg_add : 40;
+                        uint64_t reserved : 24;
+                } data;
+        } conv_value2;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t send_data_out : 1;
+                        uint64_t reserved : 15;
+                        uint64_t channel_byte_num : 16;
+                        uint64_t dma_total_byte : 32;
+                } data;
+        } dma_parameter;
 } kpu_layer_argument_t;
 typedef struct
+{
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t shift_number : 8;
             uint64_t y_mul : 16;
             uint64_t x_start : 36;
         } data;
     } activate_para[16];
     union
+    {
         uint64_t reg;
         struct
+        {
             uint8_t result_bias[8];
         } data;
     } activate_para_bias0;
     union
+    {
         uint64_t reg;
         struct
+        {
             uint8_t result_bias[8];
         } data;
     } activate_para_bias1;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t shift_number : 8;
+                        uint64_t y_mul : 16;
+                        uint64_t x_start : 36;
+                } data;
+        } activate_para[16];
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint8_t result_bias[8];
+                } data;
+        } activate_para_bias0;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint8_t result_bias[8];
+                } data;
+        } activate_para_bias1;
 } kpu_activate_table_t;
 typedef struct
+{
     union
+    {
         uint64_t reg;
         struct
+        {
             uint64_t norm_mul : 24;
             uint64_t norm_add : 32;
             uint64_t norm_shift : 4;
         } data;
     } batchnorm;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint64_t norm_mul : 24;
+                        uint64_t norm_add : 32;
+                        uint64_t norm_shift : 4;
+                } data;
+        } batchnorm;
 } kpu_batchnorm_argument_t;
 typedef struct
+{
     union
+    {
         uint64_t reg;
         struct
+        {
             uint16_t weight[9];
         } data;
     } weights;
+        union
+        {
+                uint64_t reg;
+                struct
+                {
+                        uint16_t weight[9];
+                } data;
+        } weights;
 } kpu_weights_kernel_16_3x3_t;
 /*
  * 64-bit bit-field view of a KPU interrupt register: three one-bit flags
  * followed by 61 reserved bits. kpu_config_t uses it as the `data` member of
  * interrupt_status, interrupt_raw, interrupt_mask and interrupt_clear.
  * NOTE(review): the fields appear twice because this listing interleaves two
  * differently indented variants of the same declaration.
  */
 typedef struct
+{
     uint64_t calc_done_int : 1;
     uint64_t layer_cfg_almost_empty_int : 1;
     uint64_t layer_cfg_almost_full_int : 1;
     uint64_t reserved : 61;
+        uint64_t calc_done_int : 1;
+        uint64_t layer_cfg_almost_empty_int : 1;
+        uint64_t layer_cfg_almost_full_int : 1;
+        uint64_t reserved : 61;
 } kpu_config_interrupt_t;
 /*
  * 64-bit bit-field view of the KPU fifo_threshold register: two 4-bit
  * thresholds followed by 56 reserved bits. kpu_run_kmodel() writes
  * fifo_full_threshold = 10 and fifo_empty_threshold = 1 through it.
  * NOTE(review): the fields appear twice because this listing interleaves two
  * differently indented variants of the same declaration.
  */
 typedef struct
+{
     uint64_t fifo_full_threshold : 4;
     uint64_t fifo_empty_threshold : 4;
     uint64_t reserved : 56;
+        uint64_t fifo_full_threshold : 4;
+        uint64_t fifo_empty_threshold : 4;
+        uint64_t reserved : 56;
 } kpu_config_fifo_threshold_t;
 typedef struct
+{
     uint64_t dma_fifo_flush_n : 1;
     uint64_t gs_fifo_flush_n : 1;
     uint64_t cfg_fifo_flush_n : 1;
     uint64_t cmd_fifo_flush_n : 1;
     uint64_t resp_fifo_flush_n : 1;
     uint64_t reserved : 59;
+        uint64_t dma_fifo_flush_n : 1;
+        uint64_t gs_fifo_flush_n : 1;
+        uint64_t cfg_fifo_flush_n : 1;
+        uint64_t cmd_fifo_flush_n : 1;
+        uint64_t resp_fifo_flush_n : 1;
+        uint64_t reserved : 59;
 } kpu_config_fifo_ctrl_t;
 /*
  * 64-bit bit-field view of the KPU eight_bit_mode register: a single flag
  * bit followed by 63 reserved bits. kpu_run_kmodel() sets the flag from bit 0
  * of the kmodel header's `flags` field.
  * NOTE(review): the fields appear twice because this listing interleaves two
  * differently indented variants of the same declaration.
  */
 typedef struct
+{
     uint64_t eight_bit_mode : 1;
     uint64_t reserved : 63;
+        uint64_t eight_bit_mode : 1;
+        uint64_t reserved : 63;
 } kpu_config_eight_bit_mode_t;
 typedef struct
+{
     volatile uint64_t layer_argument_fifo;
     volatile union
+    {
         uint64_t reg;
         kpu_config_interrupt_t data;
     } interrupt_status;
     volatile union
+    {
         uint64_t reg;
         kpu_config_interrupt_t data;
     } interrupt_raw;
     volatile union
+    {
         uint64_t reg;
         kpu_config_interrupt_t data;
     } interrupt_mask;
     volatile union
+    {
         uint64_t reg;
         kpu_config_interrupt_t data;
     } interrupt_clear;
     volatile union
+    {
         uint64_t reg;
         kpu_config_fifo_threshold_t data;
     } fifo_threshold;
     volatile uint64_t fifo_data_out;
     volatile union
+    {
         uint64_t reg;
         kpu_config_fifo_ctrl_t data;
     } fifo_ctrl;
     volatile union
+    {
         uint64_t reg;
         kpu_config_eight_bit_mode_t data;
     } eight_bit_mode;
+        volatile uint64_t layer_argument_fifo;
+        volatile union
+        {
+                uint64_t reg;
+                kpu_config_interrupt_t data;
+        } interrupt_status;
+        volatile union
+        {
+                uint64_t reg;
+                kpu_config_interrupt_t data;
+        } interrupt_raw;
+        volatile union
+        {
+                uint64_t reg;
+                kpu_config_interrupt_t data;
+        } interrupt_mask;
+        volatile union
+        {
+                uint64_t reg;
+                kpu_config_interrupt_t data;
+        } interrupt_clear;
+        volatile union
+        {
+                uint64_t reg;
+                kpu_config_fifo_threshold_t data;
+        } fifo_threshold;
+        volatile uint64_t fifo_data_out;
+        volatile union
+        {
+                uint64_t reg;
+                kpu_config_fifo_ctrl_t data;
+        } fifo_ctrl;
+        volatile union
+        {
+                uint64_t reg;
+                kpu_config_eight_bit_mode_t data;
+        } eight_bit_mode;
 } kpu_config_t;
+#define dmac_channel_number_t int
+typedef struct
+{
+    kpu_layer_argument_t *layers;
+    kpu_layer_argument_t *remain_layers;
+    plic_irq_callback_t callback;
+    void *ctx;
+    uint64_t *src;
+    uint64_t *dst;
+    uint32_t src_length;
+    uint32_t dst_length;
+    uint32_t layers_length;
+    uint32_t remain_layers_length;
+    dmac_channel_number_t dma_ch;
+    uint32_t eight_bit_mode;
+    float output_scale;
+    float output_bias;
+    float input_scale;
+    float input_bias;
+} kpu_task_t;
/*
 * Header found at the start of a kmodel buffer: kpu_run_kmodel() casts
 * ctx->model_buffer to this type and uses bit 0 of `flags` to select
 * eight-bit mode.
 * NOTE(review): `typedef struct {` and the field list appear twice because
 * this listing interleaves two differently indented variants of the same
 * declaration; the real header presumably contains only one of them.
 */
+typedef struct
+{
+    uint32_t version;
+    uint32_t flags;
+    uint32_t arch;
+    uint32_t layers_length;
+    uint32_t max_start_address;
+    uint32_t main_mem_usage;
+    uint32_t output_count;
+typedef struct
+{
+        uint32_t version;
+        uint32_t flags;
+        uint32_t arch;
+        uint32_t layers_length;
+        uint32_t max_start_address;
+        uint32_t main_mem_usage;
+        uint32_t output_count;
 } kpu_kmodel_header_t;
 typedef struct
+{
     uint32_t version;
     uint32_t flags;
     uint32_t layers_length;
     uint32_t max_start_address;
     uint32_t layers_argument_start;
+        uint32_t version;
+        uint32_t flags;
+        uint32_t layers_length;
+        uint32_t max_start_address;
+        uint32_t layers_argument_start;
 } kpu_model_header_t;
 typedef struct
+{
     uint32_t address;
     uint32_t size;
+        uint32_t address;
+        uint32_t size;
 } kpu_model_output_t;
 /*
  * Layer type tags. kpu_run_kmodel() compares kpu_model_layer_header_t.type
  * against these values (KL_K210_CONV, KL_FULLY_CONNECTED).
  * Generic layer kinds count up from KL_INVALID = 0; the K210-specific kinds
  * start at KL_K210_CONV = 10240 and count up from there.
  * NOTE(review): the enumerator list appears twice because this listing
  * interleaves two differently indented variants of the same declaration.
  */
 typedef enum
+{
     KL_INVALID = 0,
     KL_ADD,
     KL_QUANTIZED_ADD,
     KL_GLOBAL_MAX_POOL2D,
     KL_QUANTIZED_GLOBAL_MAX_POOL2D,
     KL_GLOBAL_AVERAGE_POOL2D,
     KL_QUANTIZED_GLOBAL_AVERAGE_POOL2D,
     KL_MAX_POOL2D,
     KL_QUANTIZED_MAX_POOL2D,
     KL_AVERAGE_POOL2D,
     KL_QUANTIZED_AVERAGE_POOL2D,
     KL_QUANTIZE,
     KL_DEQUANTIZE,
     KL_REQUANTIZE,
     KL_L2_NORMALIZATION,
     KL_SOFTMAX,
     KL_CONCAT,
     KL_QUANTIZED_CONCAT,
     KL_FULLY_CONNECTED,
     KL_QUANTIZED_FULLY_CONNECTED,
     KL_TENSORFLOW_FLATTEN,
     KL_QUANTIZED_TENSORFLOW_FLATTEN,
     KL_RESIZE_NEAREST_NEIGHBOR,
     KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR,
     KL_CHANNELWISE_DEQUANTIZE,
     KL_LOGISTIC,
     KL_K210_CONV = 10240,
     KL_K210_ADD_PADDING,
     KL_K210_REMOVE_PADDING,
     KL_K210_UPLOAD
+        KL_INVALID = 0,
+        KL_ADD,
+        KL_QUANTIZED_ADD,
+        KL_GLOBAL_MAX_POOL2D,
+        KL_QUANTIZED_GLOBAL_MAX_POOL2D,
+        KL_GLOBAL_AVERAGE_POOL2D,
+        KL_QUANTIZED_GLOBAL_AVERAGE_POOL2D,
+        KL_MAX_POOL2D,
+        KL_QUANTIZED_MAX_POOL2D,
+        KL_AVERAGE_POOL2D,
+        KL_QUANTIZED_AVERAGE_POOL2D,
+        KL_QUANTIZE,
+        KL_DEQUANTIZE,
+        KL_REQUANTIZE,
+        KL_L2_NORMALIZATION,
+        KL_SOFTMAX,
+        KL_CONCAT,
+        KL_QUANTIZED_CONCAT,
+        KL_FULLY_CONNECTED,
+        KL_QUANTIZED_FULLY_CONNECTED,
+        KL_TENSORFLOW_FLATTEN,
+        KL_QUANTIZED_TENSORFLOW_FLATTEN,
+        KL_RESIZE_NEAREST_NEIGHBOR,
+        KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR,
+        KL_CHANNELWISE_DEQUANTIZE,
+        KL_LOGISTIC,
+        KL_K210_CONV = 10240,
+        KL_K210_ADD_PADDING,
+        KL_K210_REMOVE_PADDING,
+        KL_K210_UPLOAD
 } kpu_model_layer_type_t;
 /*
  * Per-layer header: `type` is switched on against kpu_model_layer_type_t
  * values in kpu_run_kmodel(); `body_size` presumably gives the byte length of
  * the layer's argument body -- confirm against the loader.
  * NOTE(review): the fields appear twice because this listing interleaves two
  * differently indented variants of the same declaration.
  */
 typedef struct
+{
     uint32_t type;
     uint32_t body_size;
+        uint32_t type;
+        uint32_t body_size;
 } kpu_model_layer_header_t;
 typedef enum
+{
     KLF_NONE = 0,
     KLF_MAIN_MEM_OUT = 1
+        KLF_NONE = 0,
+        KLF_MAIN_MEM_OUT = 1
 } kpu_model_layer_flags_t;
 typedef enum
+{
     KLP_SAME = 0,
     KLP_VALID = 1
+        KLP_SAME = 0,
+        KLP_VALID = 1
 } kpu_model_padding_t;
 typedef enum
+{
     KLA_LINEAR = 0,
     KLA_RELU = 1,
     KLA_RELU6 = 2
+        KLA_LINEAR = 0,
+        KLA_RELU = 1,
+        KLA_RELU6 = 2
 } kpu_model_activation_t;
 typedef struct
+{
     float scale;
     float bias;
+        float scale;
+        float bias;
 } kpu_model_quant_param_t;
 typedef struct
+{
     uint32_t width;
     uint32_t height;
     uint32_t channels;
+        uint32_t width;
+        uint32_t height;
+        uint32_t channels;
 } kpu_model_shape_t;
 typedef struct
+{
     uint32_t start;
     uint32_t size;
+        uint32_t start;
+        uint32_t size;
 } kpu_model_memory_range_t;
 /*
  * Argument body of a KL_K210_CONV layer. kpu_run_kmodel() adds layer_offset
  * to ctx->model_buffer to locate the layer's kpu_layer_argument_t; the other
  * *_offset fields are presumably relative to the same buffer -- confirm.
  * NOTE(review): the fields appear twice because this listing interleaves two
  * differently indented variants of the same declaration.
  */
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_out_address;
     uint32_t layer_offset;
     uint32_t weights_offset;
     uint32_t bn_offset;
     uint32_t act_offset;
+        uint32_t flags;
+        uint32_t main_mem_out_address;
+        uint32_t layer_offset;
+        uint32_t weights_offset;
+        uint32_t bn_offset;
+        uint32_t act_offset;
 } kpu_model_conv_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_a_address;
     uint32_t main_mem_in_b_address;
     uint32_t main_mem_out_address;
     uint32_t count;
+        uint32_t flags;
+        uint32_t main_mem_in_a_address;
+        uint32_t main_mem_in_b_address;
+        uint32_t main_mem_out_address;
+        uint32_t count;
 } kpu_model_add_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_a_address;
     uint32_t main_mem_in_b_address;
     uint32_t main_mem_out_address;
     uint32_t count;
     int32_t in_a_offset;
     int32_t in_a_mul;
     int32_t in_a_shift;
     int32_t in_b_offset;
     int32_t in_b_mul;
     int32_t in_b_shift;
     int32_t out_offset;
     int32_t out_mul;
     int32_t out_shift;
+        uint32_t flags;
+        uint32_t main_mem_in_a_address;
+        uint32_t main_mem_in_b_address;
+        uint32_t main_mem_out_address;
+        uint32_t count;
+        int32_t in_a_offset;
+        int32_t in_a_mul;
+        int32_t in_a_shift;
+        int32_t in_b_offset;
+        int32_t in_b_mul;
+        int32_t in_b_shift;
+        int32_t out_offset;
+        int32_t out_mul;
+        int32_t out_shift;
 } kpu_model_quant_add_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     uint32_t kernel_size;
     uint32_t channels;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        uint32_t kernel_size;
+        uint32_t channels;
 } kpu_model_gap2d_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     kpu_model_shape_t in_shape;
     kpu_model_shape_t out_shape;
     uint32_t kernel_width;
     uint32_t kernel_height;
     uint32_t stride_width;
     uint32_t stride_height;
     uint32_t padding_width;
     uint32_t padding_height;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        kpu_model_shape_t in_shape;
+        kpu_model_shape_t out_shape;
+        uint32_t kernel_width;
+        uint32_t kernel_height;
+        uint32_t stride_width;
+        uint32_t stride_height;
+        uint32_t padding_width;
+        uint32_t padding_height;
 } kpu_model_quant_max_pool2d_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     kpu_model_shape_t in_shape;
     kpu_model_shape_t out_shape;
     uint32_t kernel_width;
     uint32_t kernel_height;
     uint32_t stride_width;
     uint32_t stride_height;
     uint32_t padding_width;
     uint32_t padding_height;
     kpu_model_activation_t act;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        kpu_model_shape_t in_shape;
+        kpu_model_shape_t out_shape;
+        uint32_t kernel_width;
+        uint32_t kernel_height;
+        uint32_t stride_width;
+        uint32_t stride_height;
+        uint32_t padding_width;
+        uint32_t padding_height;
+        kpu_model_activation_t act;
 } kpu_model_ave_pool2d_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t mem_out_address;
     uint32_t count;
     kpu_model_quant_param_t quant_param;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t mem_out_address;
+        uint32_t count;
+        kpu_model_quant_param_t quant_param;
 } kpu_model_quantize_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     uint32_t count;
     kpu_model_quant_param_t quant_param;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        uint32_t count;
+        kpu_model_quant_param_t quant_param;
 } kpu_model_dequantize_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     uint32_t count;
     uint8_t table[256];
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        uint32_t count;
+        uint8_t table[256];
 } kpu_model_requantize_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t kpu_mem_out_address;
     uint32_t channels;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t kpu_mem_out_address;
+        uint32_t channels;
 } kpu_model_add_padding_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     uint32_t channels;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        uint32_t channels;
 } kpu_model_remove_padding_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t kpu_mem_out_address;
     uint32_t width;
     uint32_t height;
     uint32_t channels;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t kpu_mem_out_address;
+        uint32_t width;
+        uint32_t height;
+        uint32_t channels;
 } kpu_model_upload_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     uint32_t channels;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        uint32_t channels;
 } kpu_model_l2_norm_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     uint32_t channels;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        uint32_t channels;
 } kpu_model_softmax_layer_argument_t;
 /*
  * Argument body of a concat layer: a fixed prefix followed by a trailing
  * array of memory ranges, presumably input_count entries long -- confirm.
  * NOTE(review): `inputs_mem[0]` is a zero-length array (a GNU extension); the
  * standard C99 spelling for a trailing variable-length member is
  * `inputs_mem[]`.
  * NOTE(review): the fields appear twice because this listing interleaves two
  * differently indented variants of the same declaration.
  */
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_out_address;
     uint32_t input_count;
     kpu_model_memory_range_t inputs_mem[0];
+        uint32_t flags;
+        uint32_t main_mem_out_address;
+        uint32_t input_count;
+        kpu_model_memory_range_t inputs_mem[0];
 } kpu_model_concat_layer_argument_t;
 /*
  * Argument body of a KL_FULLY_CONNECTED layer. When this is the first layer,
  * kpu_run_kmodel() passes in_channels and ctx->main_buffer +
  * main_mem_in_address to kpu_kmodel_input_float() along with the source data.
  * NOTE(review): `weights[0]` is a zero-length array (a GNU extension); the
  * standard C99 spelling for a trailing variable-length member is `weights[]`.
  * NOTE(review): the fields appear twice because this listing interleaves two
  * differently indented variants of the same declaration.
  */
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     uint32_t in_channels;
     uint32_t out_channels;
     kpu_model_activation_t act;
     float weights[0];
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        uint32_t in_channels;
+        uint32_t out_channels;
+        kpu_model_activation_t act;
+        float weights[0];
 } kpu_model_fully_connected_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     kpu_model_shape_t shape;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        kpu_model_shape_t shape;
 } kpu_model_tf_flatten_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     kpu_model_shape_t in_shape;
     uint32_t out_width;
     uint32_t out_height;
     uint32_t align_corners;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        kpu_model_shape_t in_shape;
+        uint32_t out_width;
+        uint32_t out_height;
+        uint32_t align_corners;
 } kpu_model_resize_nearest_neighbor_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     kpu_model_shape_t in_shape;
     uint32_t out_width;
     uint32_t out_height;
     uint32_t align_corners;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        kpu_model_shape_t in_shape;
+        uint32_t out_width;
+        uint32_t out_height;
+        uint32_t align_corners;
 } kpu_model_quant_resize_nearest_neighbor_layer_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     uint32_t channels;
     uint32_t channel_size;
     kpu_model_quant_param_t quant_params[0];
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        uint32_t channels;
+        uint32_t channel_size;
+        kpu_model_quant_param_t quant_params[0];
 } kpu_model_channelwise_dequant_argument_t;
 typedef struct
+{
     uint32_t flags;
     uint32_t main_mem_in_address;
     uint32_t main_mem_out_address;
     uint32_t channels;
+        uint32_t flags;
+        uint32_t main_mem_in_address;
+        uint32_t main_mem_out_address;
+        uint32_t channels;
 } kpu_model_logistic_layer_argument_t;
 …
 typedef struct
+{
+    int is_nncase;
+    union
+    {
+        struct
+        {
+            const uint8_t *model_buffer;
+            uint8_t *main_buffer;
+            uint32_t output_count;
+            const kpu_model_output_t *outputs;
+            const kpu_model_layer_header_t *layer_headers;
+            const uint8_t *body_start;
+            uint32_t layers_length;
+            volatile uint32_t current_layer;
+            const uint8_t *volatile current_body;
+            dmac_channel_number_t dma_ch;
+            kpu_done_callback_t done_callback;
+            void *userdata;
+        };
+        struct
+        {
+            void* nncase_ctx;
+        };
+    };
+        union
+        {
+                struct
+                {
+                        const uint8_t *model_buffer;
+                        uint8_t *main_buffer;
+                        uint32_t output_count;
+                        const kpu_model_output_t *outputs;
+                        const kpu_model_layer_header_t *layer_headers;
+                        const uint8_t *body_start;
+                        uint32_t layers_length;
+                        volatile uint32_t current_layer;
+                        const uint8_t *volatile current_body;
+                        dmac_channel_number_t dma_ch;
+                        kpu_done_callback_t done_callback;
+                        void *userdata;
+                };
+                struct
+                {
+                        void* nncase_ctx;
+                };
+        };
 } kpu_model_context_t;
 typedef struct
+{
     uint32_t weigths_offset;
     uint32_t bn_offset;
     uint32_t act_offset;
     float input_scale;
     float input_bias;
     float output_scale;
     float output_bias;
+        uint32_t weigths_offset;
+        uint32_t bn_offset;
+        uint32_t act_offset;
+        float input_scale;
+        float input_bias;
+        float output_scale;
+        float output_bias;
 } kpu_model_layer_metadata_t;
 typedef struct _quantize_param
+{
     float scale;
     float bias;
+        float scale;
+        float bias;
 } quantize_param_t;
 extern volatile kpu_config_t *const kpu;
+/**
+ * @brief       Model compiler init kpu handler
+ *
+ * @param[in]   task            Kpu handler
+ *
+ * @return      Kpu handler
+ */
+extern kpu_task_t *kpu_task_init(kpu_task_t *task);
+/**
+ * @brief       Kpu run for AI
+ *
+ * @param[in]   task                Kpu handler
+ * @param[in]   dma_ch              DMA for kpu
+ * @param[in]   src                 The picture data
+ * @param[in]   dest                The result of kpu
+ * @param[in]   callback            The callback of kpu
+ *
+ * @return      result
+ *     - 0      Success
+ *     - Other  Fail. Kpu is busy.
+ */
+int kpu_run(kpu_task_t *task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback);
+/**
+ * @brief       Get kpu result buf
+ *
+ * @param[in]   task                Kpu handler
+ *
+ * @return      Kpu result buf
+ */
+uint8_t *kpu_get_output_buf(kpu_task_t *task);
+/**
+ * @brief       Release kpu output buf
+ *
+ * @param[in]   output_buf                Kpu output buf
+ *
+ */
+void kpu_release_output_buf(uint8_t *output_buf);
+/**
+ * @brief       Kpu run for AI
+ *
+ * @param[in]   task                Kpu handler
+*
+* @return      result
+*     - 0      Success
+*     - Other  Fail. Kpu is busy.
+*/
+int kpu_start(kpu_task_t *task);
+/**
+ * @brief      Initialize kpu handler
+ *
+ * @param[in]   task            Kpu handler
+ *
+ * @return      result
+ *     - 0      Success
+ *     - Other  Fail.
+ */
+int kpu_single_task_init(kpu_task_t *task);
+/**
+ * @brief      Uninitialize kpu handler
+ *
+ * @param[in]   task            Kpu handler
+ *
+ * @return      result
+ *     - 0      Success
+ *     - Other  Fail.
+ */
+int kpu_single_task_deinit(kpu_task_t *task);
+/**
+ * @brief      Load kmodel and init kpu task
+ *
+ * @param[in]   task            Kpu handler
+ * @param[in]   buffer          Kmodel
+ * @param[in]   meta            Test data
+ *
+ * @return      result
+ *     - 0      Success
+ *     - Other  Fail.
+ */
+int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta);
+/**
+ * @brief       Kpu initialize
+ *
+ * @param[in]   eight_bit_mode            0:16bit mode  1:8bit mode
+ * @param[in]   callback                  Callback of kpu
+ * @param[in]   userdata                  Data of callback
+ *
+ */
+void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata);
+/**
+ * @brief       Kpu input data by dma
+ *
+ * @param[in]   layer                   Kpu task layer
+ * @param[in]   src                     Image data
+ * @param[in]   dma_ch                  Dmac channel
+ * @param[in]   callback                Dmac complete callback
+ * @param[in]   userdata                Data of callback
+ *
+ */
+void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata);
+/**
+ * @brief       Kpu input data by cpu
+ *
+ * @param[in]   layer                   Kpu task layer
+ * @param[in]   src                     Image data
+ * @param[in]   width                   Image width
+ * @param[in]   height                  Image height
+ * @param[in]   channels                Color channel, RGB is 3
+ *
+ */
+void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels);
+/**
+ * @brief       Kpu run only one layer
+ *
+ * @param[in]   layer                   Kpu task layer
+ *
+ */
+void kpu_conv2d(kpu_layer_argument_t *layer);
+/**
+ * @brief       Kpu run only one layer then get the result by dma
+ *
+ * @param[in]   layer                   Kpu task layer
+ * @param[in]   dma_ch                  Dmac channel
+ * @param[in]   dest                    Result
+ * @param[in]   callback                Dmac complete callback
+ * @param[in]   userdata                Data of callback
+ *
+ */
+void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata);
+/**
+ * @brief       Kpu pooling
+ *
+ * @param[in]   src                        Source
+ * @param[in]   src_param                  Source param
+ * @param[in]   kernel_size                Kernel size, 7*7 is 49
+ * @param[in]   channels                   Channels
+ * @param[in]   dest                       Dest
+ * @param[in]   dest_param                 Dest param
+ *
+ */
+void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param);
+/**
+ * @brief       Kpu pooling
+ *
+ * @param[in]   src                        Source
+ * @param[in]   src_param                  Source param
+ * @param[in]   kernel_size                Kernel size, 7*7 is 49
+ * @param[in]   channels                   Channels
+ * @param[in]   dest                       Dest
+ *
+ */
+void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest);
+/**
+ * @brief       Kpu fully connected by cpu
+ *
+ * @param[in]   src                                 Source
+ * @param[in]   weights                             Weight
+ * @param[in]   biases                              Biases
+ * @param[in]   dest                                Dest
+ * @param[in]   input_channels                      Input channels
+ * @param[in]   output_channels                     Output channels
+ *
+ */
+void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels);
+/**
+ * @brief       Kpu matrix multiplication
+ *
+ * @param[in]   src                                 Source
+ * @param[in]   channels                            Channels
+ * @param[in]   dest                                Dest
+ * @param[in]   dest_param                          Dest param
+ *
+ */
+void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param);
+/**
+ * @brief       Kpu dequantize
+ *
+ * @param[in]   src                                 Source
+ * @param[in]   src_param                           Source param
+ * @param[in]   count                               Dequantize count
+ * @param[in]   dest                                Dest
+ *
+ */
+void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest);
+/**
+ * @brief       Kpu load kmodel
+ *
+ * @param[in]   ctx                                 Kmodel object
+ * @param[in]   buffer                              Kmodel buffer
+ *
+ * @return      result
+ *     - 0      Success
+ *     - Other  Fail.
+ */
+ER kpu_init(kpu_model_context_t *ctx);
 int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer);
-/**
- * @brief       Kpu free kmodel buffer
+ *
- * @param[in]   ctx                                 kmodel object
+ *
- */
-void kpu_model_free(kpu_model_context_t *ctx);
-/**
- * @brief       Kpu load kmodel
+ *
- * @param[in]   ctx                                 Kmodel object
- * @param[in]   index                               Output index
- * @param[in]   data                                Output data
- * @param[in]   size                                Output data size
+ *
- * @return      result
- *     - 0      Success
- *     - Other  Fail.
- */
 int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size);
-/**
- * @brief       Kpu run kmodel
+ *
- * @param[in]   ctx                                 Kmodel object
- * @param[in]   src                                 Source data
- * @param[in]   dma_ch                              Dma channel
- * @param[in]   done_callback                       Kpu complete callback
- * @param[in]   userdata                            Data of callback
+ *
- * @return      result
- *     - 0      Success
- *     - Other  Fail.
- */
 int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata);
+ER kpu_wait_done(kpu_model_context_t *ctx, TMO tmout);
 #ifdef __cplusplus

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/region_layer.c

-              r453
+              r458
 typedef struct
+{
     float x;
     float y;
     float w;
     float h;
+        float x;
+        float y;
+        float w;
+        float h;
 } box_t;
 typedef struct
+{
     int index;
     int class;
     float **probs;
+        int index;
+        int class;
+        float **probs;
 } sortable_box_t;
 …
 int region_layer_init(region_layer_t *rl, int width, int height, int channels, int origin_width, int origin_height)
+{
     int flag = 0;
     rl->coords = 4;
     rl->image_width = 320;
     rl->image_height = 240;
     rl->classes = channels / 5 - 5;
     rl->net_width = origin_width;
     rl->net_height = origin_height;
     rl->layer_width = width;
     rl->layer_height = height;
     rl->boxes_number = (rl->layer_width * rl->layer_height * rl->anchor_number);
     rl->output_number = (rl->boxes_number * (rl->classes + rl->coords + 1));
     rl->output = malloc(rl->output_number * sizeof(float));
     if (rl->output == NULL)
+    {
         flag = -1;
         goto malloc_error;
+    }
     rl->boxes = malloc(rl->boxes_number * sizeof(box_t));
     if (rl->boxes == NULL)
+    {
         flag = -2;
         goto malloc_error;
+    }
     rl->probs_buf = malloc(rl->boxes_number * (rl->classes + 1) * sizeof(float));
     if (rl->probs_buf == NULL)
+    {
         flag = -3;
         goto malloc_error;
+    }
     rl->probs = malloc(rl->boxes_number * sizeof(float *));
     if (rl->probs == NULL)
+    {
         flag = -4;
         goto malloc_error;
+    }
     for (uint32_t i = 0; i < rl->boxes_number; i++)
         rl->probs[i] = &(rl->probs_buf[i * (rl->classes + 1)]);
     return 0;
+        int flag = 0;
+        rl->coords = 4;
+        rl->image_width = 320;
+        rl->image_height = 240;
+        rl->classes = channels / 5 - 5;
+        rl->net_width = origin_width;
+        rl->net_height = origin_height;
+        rl->layer_width = width;
+        rl->layer_height = height;
+        rl->boxes_number = (rl->layer_width * rl->layer_height * rl->anchor_number);
+        rl->output_number = (rl->boxes_number * (rl->classes + rl->coords + 1));
+        rl->output = malloc(rl->output_number * sizeof(float));
+        if (rl->output == NULL)
+        {
+                flag = -1;
+                goto malloc_error;
+        }
+        rl->boxes = malloc(rl->boxes_number * sizeof(box_t));
+        if (rl->boxes == NULL)
+        {
+                flag = -2;
+                goto malloc_error;
+        }
+        rl->probs_buf = malloc(rl->boxes_number * (rl->classes + 1) * sizeof(float));
+        if (rl->probs_buf == NULL)
+        {
+                flag = -3;
+                goto malloc_error;
+        }
+        rl->probs = malloc(rl->boxes_number * sizeof(float *));
+        if (rl->probs == NULL)
+        {
+                flag = -4;
+                goto malloc_error;
+        }
+        for (uint32_t i = 0; i < rl->boxes_number; i++)
+                rl->probs[i] = &(rl->probs_buf[i * (rl->classes + 1)]);
+        return 0;
 malloc_error:
     free(rl->output);
     free(rl->boxes);
     free(rl->probs_buf);
     free(rl->probs);
     return flag;
+        free(rl->output);
+        free(rl->boxes);
+        free(rl->probs_buf);
+        free(rl->probs);
+        return flag;
+}
 void region_layer_deinit(region_layer_t *rl)
+{
     free(rl->output);
     free(rl->boxes);
     free(rl->probs_buf);
     free(rl->probs);
+        free(rl->output);
+        free(rl->boxes);
+        free(rl->probs_buf);
+        free(rl->probs);
+}
 static inline float sigmoid(float x)
+{
     return 1.f / (1.f + expf(-x));
+        return 1.f / (1.f + expf(-x));
+}
 static void activate_array(region_layer_t *rl, int index, int n)
+{
     float *output = &rl->output[index];
     float *input = &rl->input[index];
     for (int i = 0; i < n; ++i)
         output[i] = sigmoid(input[i]);
+        float *output = &rl->output[index];
+        float *input = &rl->input[index];
+        for (int i = 0; i < n; ++i)
+                output[i] = sigmoid(input[i]);
+}
 static int entry_index(region_layer_t *rl, int location, int entry)
+{
     int wh = rl->layer_width * rl->layer_height;
     int n   = location / wh;
     int loc = location % wh;
     return n * wh * (rl->coords + rl->classes + 1) + entry * wh + loc;
+        int wh = rl->layer_width * rl->layer_height;
+        int n   = location / wh;
+        int loc = location % wh;
+        return n * wh * (rl->coords + rl->classes + 1) + entry * wh + loc;
+}
 static void softmax(region_layer_t *rl, float *input, int n, int stride, float *output)
+{
     int i;
     float diff;
     float e;
     float sum = 0;
     float largest_i = input[0];
     for (i = 0; i < n; ++i)
+    {
         if (input[i * stride] > largest_i)
             largest_i = input[i * stride];
+    }
     for (i = 0; i < n; ++i) {
         diff = input[i * stride] - largest_i;
         e = expf(diff);
         sum += e;
         output[i * stride] = e;
+    }
     for (i = 0; i < n; ++i)
         output[i * stride] /= sum;
+        int i;
+        float diff;
+        float e;
+        float sum = 0;
+        float largest_i = input[0];
+        for (i = 0; i < n; ++i)
+        {
+                if (input[i * stride] > largest_i)
+                        largest_i = input[i * stride];
+        }
+        for (i = 0; i < n; ++i) {
+                diff = input[i * stride] - largest_i;
+                e = expf(diff);
+                sum += e;
+                output[i * stride] = e;
+        }
+        for (i = 0; i < n; ++i)
+                output[i * stride] /= sum;
+}
 static void softmax_cpu(region_layer_t *rl, float *input, int n, int batch, int batch_offset, int groups, int stride, float *output)
+{
     int g, b;
     for (b = 0; b < batch; ++b) {
         for (g = 0; g < groups; ++g)
             softmax(rl, input + b * batch_offset + g, n, stride, output + b * batch_offset + g);
+    }
+        int g, b;
+        for (b = 0; b < batch; ++b) {
+                for (g = 0; g < groups; ++g)
+                        softmax(rl, input + b * batch_offset + g, n, stride, output + b * batch_offset + g);
+        }
+}
 static void forward_region_layer(region_layer_t *rl)
+{
     int index;
     for (index = 0; index < rl->output_number; index++)
         rl->output[index] = rl->input[index];
     for (int n = 0; n < rl->anchor_number; ++n)
+    {
         index = entry_index(rl, n * rl->layer_width * rl->layer_height, 0);
         activate_array(rl, index, 2 * rl->layer_width * rl->layer_height);
         index = entry_index(rl, n * rl->layer_width * rl->layer_height, 4);
         activate_array(rl, index, rl->layer_width * rl->layer_height);
+    }
     index = entry_index(rl, 0, rl->coords + 1);
     softmax_cpu(rl, rl->input + index, rl->classes, rl->anchor_number,
             rl->output_number / rl->anchor_number, rl->layer_width * rl->layer_height,
             rl->layer_width * rl->layer_height, rl->output + index);
+        int index;
+        for (index = 0; index < rl->output_number; index++)
+                rl->output[index] = rl->input[index];
+        for (int n = 0; n < rl->anchor_number; ++n)
+        {
+                index = entry_index(rl, n * rl->layer_width * rl->layer_height, 0);
+                activate_array(rl, index, 2 * rl->layer_width * rl->layer_height);
+                index = entry_index(rl, n * rl->layer_width * rl->layer_height, 4);
+                activate_array(rl, index, rl->layer_width * rl->layer_height);
+        }
+        index = entry_index(rl, 0, rl->coords + 1);
+        softmax_cpu(rl, rl->input + index, rl->classes, rl->anchor_number,
+                        rl->output_number / rl->anchor_number, rl->layer_width * rl->layer_height,
+                        rl->layer_width * rl->layer_height, rl->output + index);
+}
 static void correct_region_boxes(region_layer_t *rl, box_t *boxes)
+{
     uint32_t net_width = rl->net_width;
     uint32_t net_height = rl->net_height;
     uint32_t image_width = rl->image_width;
     uint32_t image_height = rl->image_height;
     uint32_t boxes_number = rl->boxes_number;
     int new_w = 0;
     int new_h = 0;
     if (((float)net_width / image_width) <
         ((float)net_height / image_height)) {
         new_w = net_width;
         new_h = (image_height * net_width) / image_width;
     } else {
         new_h = net_height;
         new_w = (image_width * net_height) / image_height;
+    }
     for (int i = 0; i < boxes_number; ++i) {
         box_t b = boxes[i];
         b.x = (b.x - (net_width - new_w) / 2. / net_width) /
               ((float)new_w / net_width);
         b.y = (b.y - (net_height - new_h) / 2. / net_height) /
               ((float)new_h / net_height);
         b.w *= (float)net_width / new_w;
         b.h *= (float)net_height / new_h;
         boxes[i] = b;
+    }
+        uint32_t net_width = rl->net_width;
+        uint32_t net_height = rl->net_height;
+        uint32_t image_width = rl->image_width;
+        uint32_t image_height = rl->image_height;
+        uint32_t boxes_number = rl->boxes_number;
+        int new_w = 0;
+        int new_h = 0;
+        if (((float)net_width / image_width) <
+                ((float)net_height / image_height)) {
+                new_w = net_width;
+                new_h = (image_height * net_width) / image_width;
+        } else {
+                new_h = net_height;
+                new_w = (image_width * net_height) / image_height;
+        }
+        for (int i = 0; i < boxes_number; ++i) {
+                box_t b = boxes[i];
+                b.x = (b.x - (net_width - new_w) / 2. / net_width) /
+                          ((float)new_w / net_width);
+                b.y = (b.y - (net_height - new_h) / 2. / net_height) /
+                          ((float)new_h / net_height);
+                b.w *= (float)net_width / new_w;
+                b.h *= (float)net_height / new_h;
+                boxes[i] = b;
+        }
+}
 static box_t get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h, int stride)
+{
     volatile box_t b;
     b.x = (i + x[index + 0 * stride]) / w;
     b.y = (j + x[index + 1 * stride]) / h;
     b.w = expf(x[index + 2 * stride]) * biases[2 * n] / w;
     b.h = expf(x[index + 3 * stride]) * biases[2 * n + 1] / h;
     return b;
+        volatile box_t b;
+        b.x = (i + x[index + 0 * stride]) / w;
+        b.y = (j + x[index + 1 * stride]) / h;
+        b.w = expf(x[index + 2 * stride]) * biases[2 * n] / w;
+        b.h = expf(x[index + 3 * stride]) * biases[2 * n + 1] / h;
+        return b;
+}
 static void get_region_boxes(region_layer_t *rl, float *predictions, float **probs, box_t *boxes)
+{
     uint32_t layer_width = rl->layer_width;
     uint32_t layer_height = rl->layer_height;
     uint32_t anchor_number = rl->anchor_number;
     uint32_t classes = rl->classes;
     uint32_t coords = rl->coords;
     float threshold = rl->threshold;
     for (int i = 0; i < layer_width * layer_height; ++i)
+    {
         int row = i / layer_width;
         int col = i % layer_width;
         for (int n = 0; n < anchor_number; ++n)
+        {
             int index = n * layer_width * layer_height + i;
             for (int j = 0; j < classes; ++j)
                 probs[index][j] = 0;
             int obj_index = entry_index(rl, n * layer_width * layer_height + i, coords);
             int box_index = entry_index(rl, n * layer_width * layer_height + i, 0);
             float scale  = predictions[obj_index];
             boxes[index] = get_region_box(predictions, rl->anchor, n, box_index, col, row,
                 layer_width, layer_height, layer_width * layer_height);
             float max = 0;
             for (int j = 0; j < classes; ++j)
+            {
                 int class_index = entry_index(rl, n * layer_width * layer_height + i, coords + 1 + j);
                 float prob = scale * predictions[class_index];
                 probs[index][j] = (prob > threshold) ? prob : 0;
                 if (prob > max)
                     max = prob;
+            }
             probs[index][classes] = max;
+        }
+    }
     correct_region_boxes(rl, boxes);
+        uint32_t layer_width = rl->layer_width;
+        uint32_t layer_height = rl->layer_height;
+        uint32_t anchor_number = rl->anchor_number;
+        uint32_t classes = rl->classes;
+        uint32_t coords = rl->coords;
+        float threshold = rl->threshold;
+        for (int i = 0; i < layer_width * layer_height; ++i)
+        {
+                int row = i / layer_width;
+                int col = i % layer_width;
+                for (int n = 0; n < anchor_number; ++n)
+                {
+                        int index = n * layer_width * layer_height + i;
+                        for (int j = 0; j < classes; ++j)
+                                probs[index][j] = 0;
+                        int obj_index = entry_index(rl, n * layer_width * layer_height + i, coords);
+                        int box_index = entry_index(rl, n * layer_width * layer_height + i, 0);
+                        float scale  = predictions[obj_index];
+                        boxes[index] = get_region_box(predictions, rl->anchor, n, box_index, col, row,
+                                layer_width, layer_height, layer_width * layer_height);
+                        float max = 0;
+                        for (int j = 0; j < classes; ++j)
+                        {
+                                int class_index = entry_index(rl, n * layer_width * layer_height + i, coords + 1 + j);
+                                float prob = scale * predictions[class_index];
+                                probs[index][j] = (prob > threshold) ? prob : 0;
+                                if (prob > max)
+                                        max = prob;
+                        }
+                        probs[index][classes] = max;
+                }
+        }
+        correct_region_boxes(rl, boxes);
+}
 static int nms_comparator(void *pa, void *pb)
+{
     sortable_box_t a = *(sortable_box_t *)pa;
     sortable_box_t b = *(sortable_box_t *)pb;
     float diff = a.probs[a.index][b.class] - b.probs[b.index][b.class];
     if (diff < 0)
         return 1;
     else if (diff > 0)
         return -1;
     return 0;
+        sortable_box_t a = *(sortable_box_t *)pa;
+        sortable_box_t b = *(sortable_box_t *)pb;
+        float diff = a.probs[a.index][b.class] - b.probs[b.index][b.class];
+        if (diff < 0)
+                return 1;
+        else if (diff > 0)
+                return -1;
+        return 0;
+}
 static float overlap(float x1, float w1, float x2, float w2)
+{
     float l1 = x1 - w1/2;
     float l2 = x2 - w2/2;
     float left = l1 > l2 ? l1 : l2;
     float r1 = x1 + w1/2;
     float r2 = x2 + w2/2;
     float right = r1 < r2 ? r1 : r2;
     return right - left;
+        float l1 = x1 - w1/2;
+        float l2 = x2 - w2/2;
+        float left = l1 > l2 ? l1 : l2;
+        float r1 = x1 + w1/2;
+        float r2 = x2 + w2/2;
+        float right = r1 < r2 ? r1 : r2;
+        return right - left;
+}
 static float box_intersection(box_t a, box_t b)
+{
     float w = overlap(a.x, a.w, b.x, b.w);
     float h = overlap(a.y, a.h, b.y, b.h);
     if (w < 0 || h < 0)
         return 0;
     return w * h;
+        float w = overlap(a.x, a.w, b.x, b.w);
+        float h = overlap(a.y, a.h, b.y, b.h);
+        if (w < 0 || h < 0)
+                return 0;
+        return w * h;
+}
 static float box_union(box_t a, box_t b)
+{
     float i = box_intersection(a, b);
     float u = a.w * a.h + b.w * b.h - i;
     return u;
+        float i = box_intersection(a, b);
+        float u = a.w * a.h + b.w * b.h - i;
+        return u;
+}
 static float box_iou(box_t a, box_t b)
+{
     return box_intersection(a, b) / box_union(a, b);
+        return box_intersection(a, b) / box_union(a, b);
+}
 static void do_nms_sort(region_layer_t *rl, box_t *boxes, float **probs)
+{
     uint32_t boxes_number = rl->boxes_number;
     uint32_t classes = rl->classes;
     float nms_value = rl->nms_value;
     int i, j, k;
     sortable_box_t s[boxes_number];
     for (i = 0; i < boxes_number; ++i)
+    {
         s[i].index = i;
         s[i].class = 0;
         s[i].probs = probs;
+    }
     for (k = 0; k < classes; ++k)
+    {
         for (i = 0; i < boxes_number; ++i)
             s[i].class = k;
         qsort(s, boxes_number, sizeof(sortable_box_t), nms_comparator);
         for (i = 0; i < boxes_number; ++i)
+        {
             if (probs[s[i].index][k] == 0)
                 continue;
             box_t a = boxes[s[i].index];
             for (j = i + 1; j < boxes_number; ++j)
+            {
                 box_t b = boxes[s[j].index];
                 if (box_iou(a, b) > nms_value)
                     probs[s[j].index][k] = 0;
+            }
+        }
+    }
+        uint32_t boxes_number = rl->boxes_number;
+        uint32_t classes = rl->classes;
+        float nms_value = rl->nms_value;
+        int i, j, k;
+        sortable_box_t s[boxes_number];
+        for (i = 0; i < boxes_number; ++i)
+        {
+                s[i].index = i;
+                s[i].class = 0;
+                s[i].probs = probs;
+        }
+        for (k = 0; k < classes; ++k)
+        {
+                for (i = 0; i < boxes_number; ++i)
+                        s[i].class = k;
+                qsort(s, boxes_number, sizeof(sortable_box_t), nms_comparator);
+                for (i = 0; i < boxes_number; ++i)
+                {
+                        if (probs[s[i].index][k] == 0)
+                                continue;
+                        box_t a = boxes[s[i].index];
+                        for (j = i + 1; j < boxes_number; ++j)
+                        {
+                                box_t b = boxes[s[j].index];
+                                if (box_iou(a, b) > nms_value)
+                                        probs[s[j].index][k] = 0;
+                        }
+                }
+        }
+}
 static int max_index(float *a, int n)
+{
     int i, max_i = 0;
     float max = a[0];
     for (i = 1; i < n; ++i)
+    {
         if (a[i] > max)
+        {
             max   = a[i];
             max_i = i;
+        }
+    }
     return max_i;
+        int i, max_i = 0;
+        float max = a[0];
+        for (i = 1; i < n; ++i)
+        {
+                if (a[i] > max)
+                {
+                        max   = a[i];
+                        max_i = i;
+                }
+        }
+        return max_i;
+}
 /*
  * Convert the detections held in rl into the obj_info result list.
  *
  * For every box index below rl->boxes_number the best-scoring class is
  * picked with max_index().  When that class probability is greater than
  * rl->threshold, the box is converted from its b->x / b->y / b->w / b->h
  * description (scaled by the image width and height) into the corner
  * coordinates x1, y1, x2, y2 and appended to obj_info->obj[] together
  * with class_id and prob.  The number of stored entries is finally
  * written to obj_info->obj_number.
  *
  * NOTE(review): obj_info->obj[] is indexed without a capacity check in
  * this function - confirm the array can hold rl->boxes_number entries.
  * NOTE(review): the local boxes_number is assigned but never read; the
  * loop condition uses rl->boxes_number directly.
  */
 static void region_layer_output(region_layer_t *rl, obj_info_t *obj_info)
+{
     uint32_t obj_number = 0;
     uint32_t image_width = rl->image_width;
     uint32_t image_height = rl->image_height;
     uint32_t boxes_number = rl->boxes_number;
     float threshold = rl->threshold;
     box_t *boxes = (box_t *)rl->boxes;
     for (int i = 0; i < rl->boxes_number; ++i)
+    {
         int class  = max_index(rl->probs[i], rl->classes);
         float prob = rl->probs[i][class];
         if (prob > threshold)
+        {
             box_t *b = boxes + i;
             obj_info->obj[obj_number].x1 = b->x * image_width - (b->w * image_width / 2);
             obj_info->obj[obj_number].y1 = b->y * image_height - (b->h * image_height / 2);
             obj_info->obj[obj_number].x2 = b->x * image_width + (b->w * image_width / 2);
             obj_info->obj[obj_number].y2 = b->y * image_height + (b->h * image_height / 2);
             obj_info->obj[obj_number].class_id = class;
             obj_info->obj[obj_number].prob = prob;
             obj_number++;
+        }
+    }
     obj_info->obj_number = obj_number;
+        uint32_t obj_number = 0;
+        uint32_t image_width = rl->image_width;
+        uint32_t image_height = rl->image_height;
+        uint32_t boxes_number = rl->boxes_number;
+        float threshold = rl->threshold;
+        box_t *boxes = (box_t *)rl->boxes;
+        for (int i = 0; i < rl->boxes_number; ++i)
+        {
+                int class  = max_index(rl->probs[i], rl->classes);
+                float prob = rl->probs[i][class];
+                if (prob > threshold)
+                {
+                        box_t *b = boxes + i;
+                        obj_info->obj[obj_number].x1 = b->x * image_width - (b->w * image_width / 2);
+                        obj_info->obj[obj_number].y1 = b->y * image_height - (b->h * image_height / 2);
+                        obj_info->obj[obj_number].x2 = b->x * image_width + (b->w * image_width / 2);
+                        obj_info->obj[obj_number].y2 = b->y * image_height + (b->h * image_height / 2);
+                        obj_info->obj[obj_number].class_id = class;
+                        obj_info->obj[obj_number].prob = prob;
+                        obj_number++;
+                }
+        }
+        obj_info->obj_number = obj_number;
+}
 /*
  * Run the region-layer post-processing steps on rl, in this order:
  * forward_region_layer(), get_region_boxes() on rl->output, then
  * do_nms_sort() on the resulting rl->boxes / rl->probs.
  *
  * obj_info is not used here: the call to region_layer_output() that
  * would fill it is commented out.
  */
 void region_layer_run(region_layer_t *rl, obj_info_t *obj_info)
+{
     forward_region_layer(rl);
     get_region_boxes(rl, rl->output, rl->probs, rl->boxes);
     do_nms_sort(rl, rl->boxes, rl->probs);
     // region_layer_output(rl, obj_info);
+        forward_region_layer(rl);
+        get_region_boxes(rl, rl->output, rl->probs, rl->boxes);
+        do_nms_sort(rl, rl->boxes, rl->probs);
+        // region_layer_output(rl, obj_info);
+}
 /*
  * Report every detection above the threshold through a callback.
  *
  * For each box index below rl->boxes_number the class with the highest
  * probability is chosen with max_index().  If that probability is greater
  * than rl->threshold, the box corners are computed from b->x, b->y, b->w
  * and b->h scaled by the image width and height, and handed to
  * callback(x1, y1, x2, y2, class, prob).
  *
  * NOTE(review): the corners are stored in uint32_t; presumably the box
  * fields are floats, so a box reaching past the left/top edge would give
  * a negative value before the conversion - confirm boxes are clamped.
  */
 void region_layer_draw_boxes(region_layer_t *rl, callback_draw_box callback)
+{
     uint32_t image_width = rl->image_width;
     uint32_t image_height = rl->image_height;
     float threshold = rl->threshold;
     box_t *boxes = (box_t *)rl->boxes;
     for (int i = 0; i < rl->boxes_number; ++i)
+    {
         int class  = max_index(rl->probs[i], rl->classes);
         float prob = rl->probs[i][class];
         if (prob > threshold)
+        {
             box_t *b = boxes + i;
             uint32_t x1 = b->x * image_width - (b->w * image_width / 2);
             uint32_t y1 = b->y * image_height - (b->h * image_height / 2);
             uint32_t x2 = b->x * image_width + (b->w * image_width / 2);
             uint32_t y2 = b->y * image_height + (b->h * image_height / 2);
             callback(x1, y1, x2, y2, class, prob);
+        }
+    }
+}
+        uint32_t image_width = rl->image_width;
+        uint32_t image_height = rl->image_height;
+        float threshold = rl->threshold;
+        box_t *boxes = (box_t *)rl->boxes;
+        for (int i = 0; i < rl->boxes_number; ++i)
+        {
+                int class  = max_index(rl->probs[i], rl->classes);
+                float prob = rl->probs[i][class];
+                if (prob > threshold)
+                {
+                        box_t *b = boxes + i;
+                        uint32_t x1 = b->x * image_width - (b->w * image_width / 2);
+                        uint32_t y1 = b->y * image_height - (b->h * image_height / 2);
+                        uint32_t x2 = b->x * image_width + (b->w * image_width / 2);
+                        uint32_t y2 = b->y * image_height + (b->h * image_height / 2);
+                        callback(x1, y1, x2, y2, class, prob);
+                }
+        }
+}

azure_iot_hub_riscv/trunk/app_iothub_client/src/command.c

-              r453
+              r458
 /*
  * Write dwVal to the pin identified by Pin.
  *
  * Pin is translated with gpio_get_gpiohno(Pin, false); when the result is
  * non-negative it is used as the pin argument of gpio_set_pin() on
  * TADR_GPIOHS_BASE.  A negative lookup result is ignored: nothing is
  * written and no error is reported to the caller.
  */
 void
 digitalWrite(uint8_t Pin, int dwVal){
     int8_t gpio_pin = gpio_get_gpiohno(Pin, false);
+        int8_t gpio_pin = gpio_get_gpiohno(Pin, false);
     if( gpio_pin >= 0){
         gpio_set_pin(TADR_GPIOHS_BASE, (uint8_t)gpio_pin, dwVal);
+    }
+        if( gpio_pin >= 0){
+                gpio_set_pin(TADR_GPIOHS_BASE, (uint8_t)gpio_pin, dwVal);
+        }
+}

azure_iot_hub_riscv/trunk/app_iothub_client/src/envcmd.c

-              r453
+              r458
 /*
+*  TOPPERS/ASP Kernel
+*      Toyohashi Open Platform for Embedded Real-Time Systems/
+*      Advanced Standard Profile Kernel
+*
+*  Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory
+*                              Toyohashi Univ. of Technology, JAPAN
+*  Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory
+*              Graduate School of Information Science, Nagoya Univ., JAPAN
+*
+*  上記著作権者は，以下の(1)～(4)の条件を満たす場合に限り，本ソフトウェ
+*  ア（本ソフトウェアを改変したものを含む．以下同じ）を使用・複製・改
+*  変・再配布（以下，利用と呼ぶ）することを無償で許諾する．
+*  (1) 本ソフトウェアをソースコードの形で利用する場合には，上記の著作
+*      権表示，この利用条件および下記の無保証規定が，そのままの形でソー
+*      スコード中に含まれていること．
+*  (2) 本ソフトウェアを，ライブラリ形式など，他のソフトウェア開発に使
+*      用できる形で再配布する場合には，再配布に伴うドキュメント（利用
+*      者マニュアルなど）に，上記の著作権表示，この利用条件および下記
+*      の無保証規定を掲載すること．
+*  (3) 本ソフトウェアを，機器に組み込むなど，他のソフトウェア開発に使
+*      用できない形で再配布する場合には，次のいずれかの条件を満たすこ
+*      と．
+*    (a) 再配布に伴うドキュメント（利用者マニュアルなど）に，上記の著
+*        作権表示，この利用条件および下記の無保証規定を掲載すること．
+*    (b) 再配布の形態を，別に定める方法によって，TOPPERSプロジェクトに
+*        報告すること．
+*  (4) 本ソフトウェアの利用により直接的または間接的に生じるいかなる損
+*      害からも，上記著作権者およびTOPPERSプロジェクトを免責すること．
+*      また，本ソフトウェアのユーザまたはエンドユーザからのいかなる理
+*      由に基づく請求からも，上記著作権者およびTOPPERSプロジェクトを
+*      免責すること．
+*
+*  本ソフトウェアは，無保証で提供されているものである．上記著作権者お
+*  よびTOPPERSプロジェクトは，本ソフトウェアに関して，特定の使用目的
+*  に対する適合性も含めて，いかなる保証も行わない．また，本ソフトウェ
+*  アの利用により直接的または間接的に生じたいかなる損害に関しても，そ
+*  の責任を負わない．
+*
+*  $Id$
+*/
+ *  TOPPERS/ASP Kernel
+ *      Toyohashi Open Platform for Embedded Real-Time Systems/
+ *      Advanced Standard Profile Kernel
+ *
+ *  Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory
+ *                              Toyohashi Univ. of Technology, JAPAN
+ *  Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory
+ *              Graduate School of Information Science, Nagoya Univ., JAPAN
+ *
+ *  上記著作権者は，以下の(1)～(4)の条件を満たす場合に限り，本ソフトウェ
+ *  ア（本ソフトウェアを改変したものを含む．以下同じ）を使用・複製・改
+ *  変・再配布（以下，利用と呼ぶ）することを無償で許諾する．
+ *  (1) 本ソフトウェアをソースコードの形で利用する場合には，上記の著作
+ *      権表示，この利用条件および下記の無保証規定が，そのままの形でソー
+ *      スコード中に含まれていること．
+ *  (2) 本ソフトウェアを，ライブラリ形式など，他のソフトウェア開発に使
+ *      用できる形で再配布する場合には，再配布に伴うドキュメント（利用
+ *      者マニュアルなど）に，上記の著作権表示，この利用条件および下記
+ *      の無保証規定を掲載すること．
+ *  (3) 本ソフトウェアを，機器に組み込むなど，他のソフトウェア開発に使
+ *      用できない形で再配布する場合には，次のいずれかの条件を満たすこ
+ *      と．
+ *    (a) 再配布に伴うドキュメント（利用者マニュアルなど）に，上記の著
+ *        作権表示，この利用条件および下記の無保証規定を掲載すること．
+ *    (b) 再配布の形態を，別に定める方法によって，TOPPERSプロジェクトに
+ *        報告すること．
+ *  (4) 本ソフトウェアの利用により直接的または間接的に生じるいかなる損
+ *      害からも，上記著作権者およびTOPPERSプロジェクトを免責すること．
+ *      また，本ソフトウェアのユーザまたはエンドユーザからのいかなる理
+ *      由に基づく請求からも，上記著作権者およびTOPPERSプロジェクトを
+ *      免責すること．
+ *
+ *  本ソフトウェアは，無保証で提供されているものである．上記著作権者お
+ *  よびTOPPERSプロジェクトは，本ソフトウェアに関して，特定の使用目的
+ *  に対する適合性も含めて，いかなる保証も行わない．また，本ソフトウェ
+ *  アの利用により直接的または間接的に生じたいかなる損害に関しても，そ
+ *  の責任を負わない．
+ *
+ *  $Id$
+ */
 #include <stdio.h>
 #include <stdlib.h>
 …
+}
 int set_cs_main(int argc, char **argv)
+{
 …
         return 0;
+}
 int clear_proxy_main(int argc, char **argv)
+{

azure_iot_hub_riscv/trunk/app_iothub_client/src/esp_at_socket.c

-              r453
+              r458
 /*
 *  TOPPERS/ASP Kernel
 *      Toyohashi Open Platform for Embedded Real-Time Systems/
 *      Advanced Standard Profile Kernel
+*
 *  Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory
 *                              Toyohashi Univ. of Technology, JAPAN
 *  Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory
 *              Graduate School of Information Science, Nagoya Univ., JAPAN
+*
 *  上記著作権者は，以下の(1)～(4)の条件を満たす場合に限り，本ソフトウェ
 *  ア（本ソフトウェアを改変したものを含む．以下同じ）を使用・複製・改
 *  変・再配布（以下，利用と呼ぶ）することを無償で許諾する．
 *  (1) 本ソフトウェアをソースコードの形で利用する場合には，上記の著作
 *      権表示，この利用条件および下記の無保証規定が，そのままの形でソー
 *      スコード中に含まれていること．
 *  (2) 本ソフトウェアを，ライブラリ形式など，他のソフトウェア開発に使
 *      用できる形で再配布する場合には，再配布に伴うドキュメント（利用
 *      者マニュアルなど）に，上記の著作権表示，この利用条件および下記
 *      の無保証規定を掲載すること．
 *  (3) 本ソフトウェアを，機器に組み込むなど，他のソフトウェア開発に使
 *      用できない形で再配布する場合には，次のいずれかの条件を満たすこ
 *      と．
 *    (a) 再配布に伴うドキュメント（利用者マニュアルなど）に，上記の著
 *        作権表示，この利用条件および下記の無保証規定を掲載すること．
 *    (b) 再配布の形態を，別に定める方法によって，TOPPERSプロジェクトに
 *        報告すること．
 *  (4) 本ソフトウェアの利用により直接的または間接的に生じるいかなる損
 *      害からも，上記著作権者およびTOPPERSプロジェクトを免責すること．
 *      また，本ソフトウェアのユーザまたはエンドユーザからのいかなる理
 *      由に基づく請求からも，上記著作権者およびTOPPERSプロジェクトを
 *      免責すること．
+*
 *  本ソフトウェアは，無保証で提供されているものである．上記著作権者お
 *  よびTOPPERSプロジェクトは，本ソフトウェアに関して，特定の使用目的
 *  に対する適合性も含めて，いかなる保証も行わない．また，本ソフトウェ
 *  アの利用により直接的または間接的に生じたいかなる損害に関しても，そ
 *  の責任を負わない．
+*
 *  $Id$
 */
+ *  TOPPERS/ASP Kernel
+ *      Toyohashi Open Platform for Embedded Real-Time Systems/
+ *      Advanced Standard Profile Kernel
+ *
+ *  Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory
+ *                              Toyohashi Univ. of Technology, JAPAN
+ *  Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory
+ *              Graduate School of Information Science, Nagoya Univ., JAPAN
+ *
+ *  上記著作権者は，以下の(1)～(4)の条件を満たす場合に限り，本ソフトウェ
+ *  ア（本ソフトウェアを改変したものを含む．以下同じ）を使用・複製・改
+ *  変・再配布（以下，利用と呼ぶ）することを無償で許諾する．
+ *  (1) 本ソフトウェアをソースコードの形で利用する場合には，上記の著作
+ *      権表示，この利用条件および下記の無保証規定が，そのままの形でソー
+ *      スコード中に含まれていること．
+ *  (2) 本ソフトウェアを，ライブラリ形式など，他のソフトウェア開発に使
+ *      用できる形で再配布する場合には，再配布に伴うドキュメント（利用
+ *      者マニュアルなど）に，上記の著作権表示，この利用条件および下記
+ *      の無保証規定を掲載すること．
+ *  (3) 本ソフトウェアを，機器に組み込むなど，他のソフトウェア開発に使
+ *      用できない形で再配布する場合には，次のいずれかの条件を満たすこ
+ *      と．
+ *    (a) 再配布に伴うドキュメント（利用者マニュアルなど）に，上記の著
+ *        作権表示，この利用条件および下記の無保証規定を掲載すること．
+ *    (b) 再配布の形態を，別に定める方法によって，TOPPERSプロジェクトに
+ *        報告すること．
+ *  (4) 本ソフトウェアの利用により直接的または間接的に生じるいかなる損
+ *      害からも，上記著作権者およびTOPPERSプロジェクトを免責すること．
+ *      また，本ソフトウェアのユーザまたはエンドユーザからのいかなる理
+ *      由に基づく請求からも，上記著作権者およびTOPPERSプロジェクトを
+ *      免責すること．
+ *
+ *  本ソフトウェアは，無保証で提供されているものである．上記著作権者お
+ *  よびTOPPERSプロジェクトは，本ソフトウェアに関して，特定の使用目的
+ *  に対する適合性も含めて，いかなる保証も行わない．また，本ソフトウェ
+ *  アの利用により直接的または間接的に生じたいかなる損害に関しても，そ
+ *  の責任を負わない．
+ *
+ *  $Id$
+ */
 #include <stddef.h>
 #include <stdbool.h>
 …
+{
         struct tm tm = {
 ,  /* tm_sec */
 ,  /* tm_min */
 ,  /* tm_hour */
 ,  /* tm_mday */
 ,  /* tm_mon */
 - 1900,  /* tm_year */
+,  /* tm_sec */
+,  /* tm_min */
+,  /* tm_hour */
+,  /* tm_mday */
+,  /* tm_mon */
+- 1900,  /* tm_year */
         };
         MINIMUM_YEAR = mktime(&tm);

azure_iot_hub_riscv/trunk/app_iothub_client/src/esp_at_socket.h

-              r453
+              r458
 /*
 *  TOPPERS/ASP Kernel
 *      Toyohashi Open Platform for Embedded Real-Time Systems/
 *      Advanced Standard Profile Kernel
+*
 *  Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory
 *                              Toyohashi Univ. of Technology, JAPAN
 *  Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory
 *              Graduate School of Information Science, Nagoya Univ., JAPAN
+*
 *  上記著作権者は，以下の(1)～(4)の条件を満たす場合に限り，本ソフトウェ
 *  ア（本ソフトウェアを改変したものを含む．以下同じ）を使用・複製・改
 *  変・再配布（以下，利用と呼ぶ）することを無償で許諾する．
 *  (1) 本ソフトウェアをソースコードの形で利用する場合には，上記の著作
 *      権表示，この利用条件および下記の無保証規定が，そのままの形でソー
 *      スコード中に含まれていること．
 *  (2) 本ソフトウェアを，ライブラリ形式など，他のソフトウェア開発に使
 *      用できる形で再配布する場合には，再配布に伴うドキュメント（利用
 *      者マニュアルなど）に，上記の著作権表示，この利用条件および下記
 *      の無保証規定を掲載すること．
 *  (3) 本ソフトウェアを，機器に組み込むなど，他のソフトウェア開発に使
 *      用できない形で再配布する場合には，次のいずれかの条件を満たすこ
 *      と．
 *    (a) 再配布に伴うドキュメント（利用者マニュアルなど）に，上記の著
 *        作権表示，この利用条件および下記の無保証規定を掲載すること．
 *    (b) 再配布の形態を，別に定める方法によって，TOPPERSプロジェクトに
 *        報告すること．
 *  (4) 本ソフトウェアの利用により直接的または間接的に生じるいかなる損
 *      害からも，上記著作権者およびTOPPERSプロジェクトを免責すること．
 *      また，本ソフトウェアのユーザまたはエンドユーザからのいかなる理
 *      由に基づく請求からも，上記著作権者およびTOPPERSプロジェクトを
 *      免責すること．
+*
 *  本ソフトウェアは，無保証で提供されているものである．上記著作権者お
 *  よびTOPPERSプロジェクトは，本ソフトウェアに関して，特定の使用目的
 *  に対する適合性も含めて，いかなる保証も行わない．また，本ソフトウェ
 *  アの利用により直接的または間接的に生じたいかなる損害に関しても，そ
 *  の責任を負わない．
+*
 *  $Id$
 */
+ *  TOPPERS/ASP Kernel
+ *      Toyohashi Open Platform for Embedded Real-Time Systems/
+ *      Advanced Standard Profile Kernel
+ *
+ *  Copyright (C) 2000-2003 by Embedded and Real-Time Systems Laboratory
+ *                              Toyohashi Univ. of Technology, JAPAN
+ *  Copyright (C) 2004-2012 by Embedded and Real-Time Systems Laboratory
+ *              Graduate School of Information Science, Nagoya Univ., JAPAN
+ *
+ *  上記著作権者は，以下の(1)～(4)の条件を満たす場合に限り，本ソフトウェ
+ *  ア（本ソフトウェアを改変したものを含む．以下同じ）を使用・複製・改
+ *  変・再配布（以下，利用と呼ぶ）することを無償で許諾する．
+ *  (1) 本ソフトウェアをソースコードの形で利用する場合には，上記の著作
+ *      権表示，この利用条件および下記の無保証規定が，そのままの形でソー
+ *      スコード中に含まれていること．
+ *  (2) 本ソフトウェアを，ライブラリ形式など，他のソフトウェア開発に使
+ *      用できる形で再配布する場合には，再配布に伴うドキュメント（利用
+ *      者マニュアルなど）に，上記の著作権表示，この利用条件および下記
+ *      の無保証規定を掲載すること．
+ *  (3) 本ソフトウェアを，機器に組み込むなど，他のソフトウェア開発に使
+ *      用できない形で再配布する場合には，次のいずれかの条件を満たすこ
+ *      と．
+ *    (a) 再配布に伴うドキュメント（利用者マニュアルなど）に，上記の著
+ *        作権表示，この利用条件および下記の無保証規定を掲載すること．
+ *    (b) 再配布の形態を，別に定める方法によって，TOPPERSプロジェクトに
+ *        報告すること．
+ *  (4) 本ソフトウェアの利用により直接的または間接的に生じるいかなる損
+ *      害からも，上記著作権者およびTOPPERSプロジェクトを免責すること．
+ *      また，本ソフトウェアのユーザまたはエンドユーザからのいかなる理
+ *      由に基づく請求からも，上記著作権者およびTOPPERSプロジェクトを
+ *      免責すること．
+ *
+ *  本ソフトウェアは，無保証で提供されているものである．上記著作権者お
+ *  よびTOPPERSプロジェクトは，本ソフトウェアに関して，特定の使用目的
+ *  に対する適合性も含めて，いかなる保証も行わない．また，本ソフトウェ
+ *  アの利用により直接的または間接的に生じたいかなる損害に関しても，そ
+ *  の責任を負わない．
+ *
+ *  $Id$
+ */
 #ifndef _ESP_AT_SOCKET_H_

azure_iot_hub_riscv/trunk/app_iothub_client/src/kpu_main.c

-              r453
+              r458
 *  の責任を負わない．
+*
 *  $Id$
+*  $Id: kpu_main.c 2176 2020-08-19 23:50:05Z coas-nagasima $
 */
 …
 #include "kpu.h"
 #include "region_layer.h"
-extern void ai_dma_done_isr(DMA_Handle_t *dma);
 /*
 …
 static uint32_t lable_string_draw_ram[115 * 16 * 8 / 2];
 #endif
 extern uint8_t model_data[];
+extern const uint8_t model_data[];
 kpu_model_context_t g_task;
 static region_layer_t detect_rl;
 …
         OV2640_t        *hcmr;
         DVP_Handle_t    *hdvp;
-        uint16_t        *lcd_buffer;
         ER_UINT ercd;
         uint32_t i, count;
+        uint32_t i;
         struct tm2 time;
         unsigned long atmp;
 …
         syslog_1(LOG_NOTICE, "OV2640 id(%d)", ov2640_id(hcmr));
         Init.WorkMode     = SPI_WORK_MODE_0;
+        Init.WorkMode     = SPI_WORK_MODE_2;
         Init.FrameFormat  = SPI_FF_OCTAL;
         Init.DataSize     = 8;
 …
         Init.SsPin        = SIPEED_ST7789_SS_PIN;
         Init.SsNo         = SIPEED_ST7789_SS;
         Init.TxDMAChannel = SIPEED_DMA_CH;
+        Init.TxDMAChannel = -1;
         Init.RxDMAChannel = -1;
         Init.semid        = SPI1TRN_SEM;
 …
         lcd_init(hlcd);
         syslog_2(LOG_NOTICE, "width(%d) height(%d)", hlcd->_width, hlcd->_height);
+        count = hcmr->_width * hcmr->_height;
+        lcd_buffer = (uint16_t *)malloc(count * 2);
+        if(lcd_buffer == NULL){
+                syslog_0(LOG_ERROR, "no lcd buffer !");
+                slp_tsk();
+        }
         DrawProp.BackColor = ST7789_WHITE;
         DrawProp.TextColor = ST7789_BLACK;
 …
                 syslog_0(LOG_ERROR, "SD-CARD INITAIL ERROR !");
 #endif
+        extern DMA_Handle_t g_ai_hdma;
+        g_ai_hdma.chnum = AI_DMA_CH;
+        g_ai_hdma.xfercallback = ai_dma_done_isr;
+        g_ai_hdma.errorcallback     = NULL;
+        g_ai_hdma.Init.Request      = DMA_SELECT_AI_RX_REQ;     /* DMA選択 */
+        g_ai_hdma.Init.Direction    = DMA_PERIPH_TO_MEMORY;     /* DMA転送方向 */
+        g_ai_hdma.Init.SrcMultBlock = DMAC_MULTBLOCK_CONT;      /* ソースマルチブロックタイプ */
+        g_ai_hdma.Init.DrcMultBlock = DMAC_MULTBLOCK_CONT;      /* デスティネーションマルチブロックタイプ */
+        g_ai_hdma.Init.SrcHandShake = DMAC_HS_HARDWARE; /* ソースハンドシェイク */
+        g_ai_hdma.Init.DrcHandShake = DMAC_HS_SOFTWARE; /* デスティネーションハンドシェイク */
+        g_ai_hdma.Init.SrcHwhsPol   = DMAC_HWHS_POLARITY_LOW;   /* ソースハードウェアハンドシェイク極性 */
+        g_ai_hdma.Init.DrcHwhsPol   = DMAC_HWHS_POLARITY_LOW;   /* デスティネーションハードウェアハンドシェイク極性 */
+        g_ai_hdma.Init.Priority     = 4;        /* 優先度 */
+        g_ai_hdma.Init.SrcMaster    = DMAC_MASTER1;     /* ソースマスター設定 */
+        g_ai_hdma.Init.DstMaster    = DMAC_MASTER2;     /* デスティネーションマスター設定 */
+        g_ai_hdma.Init.SrcInc       = DMAC_ADDR_NOCHANGE;       /* ソースインクリメント設定 */
+        g_ai_hdma.Init.DstInc       = DMAC_ADDR_INCREMENT;      /* デスティネーションインクリメント設定 */
+        g_ai_hdma.Init.SrcTransWidth = DMAC_TRANS_WIDTH_32;     /* ソース転送幅 */
+        g_ai_hdma.Init.DstTransWidth = DMAC_TRANS_WIDTH_32;     /* デスティネーション転送幅 */
+        g_ai_hdma.Init.SrcBurstSize = DMAC_MSIZE_4;     /* ソースバーストサイズ */
+        g_ai_hdma.Init.DstBurstSize = DMAC_MSIZE_4;     /* デスティネーションバーストサイズ */
+        g_ai_hdma.Init.IocBlkTrans  = 0;        /* IOCブロック転送 */
+        g_ai_hdma.localdata         = (void *)&g_task;
+        if ((ercd = dma_init(&g_ai_hdma)) != E_OK) {
+                syslog_0(LOG_ERROR, "AI-DMA INITAIL ERROR !");
+        if ((ercd = kpu_init(&g_task)) != E_OK) {
+                syslog_0(LOG_ERROR, "kpu init error");
+                slp_tsk();
+        }
 …
         detect_rl.threshold = 0.7;
         detect_rl.nms_value = 0.3;
+        region_layer_init(&detect_rl, 10, 7, 125, 320, 240);
+        if (region_layer_init(&detect_rl, 10, 7, 125, 320, 240) != 0) {
+                syslog_0(LOG_ERROR, "region layer init error");
+                slp_tsk();
+        }
         bool_t camok = true;
 …
                         atmp = (unsigned long)hcmr->_aiBuffer - IOMEM;
                         kpu_run_kmodel(&g_task, (const uint8_t *)atmp, AI_DMA_CH, ai_done, NULL);
-                        uint32_t *p = (uint32_t *)hcmr->_dataBuffer;
-                        uint32_t *q = (uint32_t *)lcd_buffer;
-                        uint32_t *e = (uint32_t *)&lcd_buffer[count];
-                        for (; q < e ; p++, q++){
-                                *q = SWAP_32(*p);
+                        }
+                }
 …
+                }
                 lcd_drawPicture(hlcd, 0, 0, hcmr->_width, hcmr->_height, lcd_buffer);
+                lcd_drawPicture(hlcd, 0, 0, hcmr->_width, hcmr->_height, (uint16_t *)hcmr->_dataBuffer);
                 /* draw boxs */

azure_iot_hub_riscv/trunk/app_iothub_client/src/kpu_main.h

-              r453
+              r458
  *  の責任を負わない．
+ *
  *  $Id$
+ *  $Id: kpu_main.h 2173 2020-08-19 05:33:00Z coas-nagasima $
  */
 …
  */
+#define KPU_PRIORITY    5               /* priority of the main task (original wording) */
+                                                                /* must be made higher than HIGH_PRIORITY */
+#define KPU_PRIORITY    6               /* priority of the main task (original wording) */
 /*
 …
 #define INHNO_SPI     IRQ_VECTOR_SPI0   /* interrupt handler number */
 #define INTNO_SPI     IRQ_VECTOR_SPI0   /* interrupt number */
 #define INTPRI_SPI    -5                /* interrupt priority */
+#define INTPRI_SPI    -6                /* interrupt priority */
 #define INTATR_SPI    0                 /* interrupt attribute */
 …
 #define INHNO_DMATX   IRQ_VECTOR_DMA3   /* interrupt handler number */
 #define INTNO_DMATX   IRQ_VECTOR_DMA3   /* interrupt number */
 #define INTPRI_DMATX  -4                /* interrupt priority */
+#define INTPRI_DMATX  -6                /* interrupt priority */
 #define INTATR_DMATX  0                 /* interrupt attribute */
 …
 #define INHNO_SPIC    IRQ_VECTOR_SPI1   /* interrupt handler number */
 #define INTNO_SPIC    IRQ_VECTOR_SPI1   /* interrupt number */
 #define INTPRI_SPIC   -5                /* interrupt priority */
+#define INTPRI_SPIC   -6                /* interrupt priority */
 #define INTATR_SPIC   0                 /* interrupt attribute */
 …
 #define INHNO_DMARX   IRQ_VECTOR_DMA2   /* interrupt handler number */
 #define INTNO_DMARX   IRQ_VECTOR_DMA2   /* interrupt number */
 #define INTPRI_DMARX  -4                /* interrupt priority */
+#define INTPRI_DMARX  -6                /* interrupt priority */
 #define INTATR_DMARX  0                 /* interrupt attribute */
 #define INHNO_AI      IRQ_VECTOR_AI     /* interrupt handler number */
 #define INTNO_AI      IRQ_VECTOR_AI     /* interrupt number */
 #define INTPRI_AI     -4                /* interrupt priority */
+#define INTPRI_AI     -7                /* interrupt priority */
 #define INTATR_AI     0                 /* interrupt attribute */
 …
 #define INHNO_DMAAI   IRQ_VECTOR_DMA5   /* interrupt handler number */
 #define INTNO_DMAAI   IRQ_VECTOR_DMA5   /* interrupt number */
 #define INTPRI_DMAAI  -4                /* interrupt priority */
+#define INTPRI_DMAAI  -7                /* interrupt priority */
 #define INTATR_DMAAI  0                 /* interrupt attribute */

azure_iot_hub_riscv/trunk/app_iothub_client/src/main.cfg

-              r454
+              r458
 INCLUDE("syssvc/serial.cfg");
 INCLUDE("syssvc/logtask.cfg");
+INCLUDE("syssvc/malloc.cfg");
 INCLUDE("pdic/k210/device.cfg");
 INCLUDE("pdic/k210/dvp.cfg");
 …
 CRE_TSK(MAIN_TASK, { TA_ACT, 0, main_task, MAIN_PRIORITY, STACK_SIZE, NULL });
 CRE_TSK(KPU_TASK, { TA_NULL, 0, kpu_task, KPU_PRIORITY, KPU_STACK_SIZE, NULL });
+CRE_TSK(KPU_TASK, { TA_ACT, 0, kpu_task, KPU_PRIORITY, KPU_STACK_SIZE, NULL });
 ATT_ISR({TA_NULL, SPI_PORTID, INTNO_SPI, spi_isr, 1 });
 …
 ATT_ISR({TA_NULL, 0, INTNO_AI, ai_done_isr, 1 });
 CFG_INT(INTNO_AI, { TA_ENAINT | INTATR_AI, INTPRI_AI });
+CFG_INT(INTNO_AI, { INTATR_AI, INTPRI_AI });
 ATT_ISR({TA_NULL, AI_DMA_CH, INTNO_DMAAI, channel_dmac_isr, 1 });
 CFG_INT(INTNO_DMAAI, { TA_ENAINT | INTATR_DMAAI, INTPRI_DMAAI });

azure_iot_hub_riscv/trunk/app_iothub_client/src/main.h

-              r453
+              r458
  */
+#define MAIN_PRIORITY   8               /* priority of the main task */
+                                                                /* must be made higher than HIGH_PRIORITY */
+#define MAIN_PRIORITY   5               /* priority of the main task */
 /*

Context Navigation

Legend:

azure_iot_hub_riscv/trunk/app_iothub_client/.vscode/tasks.json

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/atomic.h

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/incbin.h

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.c

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.h

azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/region_layer.c

azure_iot_hub_riscv/trunk/app_iothub_client/src/command.c

azure_iot_hub_riscv/trunk/app_iothub_client/src/envcmd.c

azure_iot_hub_riscv/trunk/app_iothub_client/src/esp_at_socket.c

azure_iot_hub_riscv/trunk/app_iothub_client/src/esp_at_socket.h

azure_iot_hub_riscv/trunk/app_iothub_client/src/kpu_main.c

azure_iot_hub_riscv/trunk/app_iothub_client/src/kpu_main.h

azure_iot_hub_riscv/trunk/app_iothub_client/src/main.cfg

azure_iot_hub_riscv/trunk/app_iothub_client/src/main.h

Download in other formats: