Support KModel V4

pull/89/head
sunnycase 2019-07-24 15:24:18 +08:00
parent 49d25c8f4a
commit 6c201580a0
42 changed files with 5141 additions and 832 deletions

.gitignore (vendored): 1634 lines changed; diff suppressed because it is too large.

@@ -22,6 +22,11 @@ ENDIF ()
# definitions in macros
add_definitions(-DCONFIG_LOG_LEVEL=LOG_VERBOSE -DCONFIG_LOG_ENABLE -DCONFIG_LOG_COLORS -DLOG_KERNEL -D__riscv64 -DLV_CONF_INCLUDE_SIMPLE)
# xtl options
add_definitions(-DTCB_SPAN_NO_EXCEPTIONS -DTCB_SPAN_NO_CONTRACT_CHECKING)
# nncase options
add_definitions(-DNNCASE_TARGET=k210)
if (NOT SDK_ROOT)
get_filename_component(_SDK_ROOT ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
global_set(SDK_ROOT ${_SDK_ROOT})

@@ -40,6 +40,7 @@ if (BUILDING_SDK)
-Wno-error=unused-but-set-variable
-Wno-error=unused-variable
-Wno-error=deprecated-declarations
-Wno-multichar
-Wextra
-Werror=frame-larger-than=32768
-Wno-unused-parameter

@@ -103,7 +103,7 @@ SECTIONS
{
PROVIDE_HIDDEN (__init_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)
PROVIDE_HIDDEN (__init_array_end = .);
} >ram AT>ram :ram_ro

@@ -5,8 +5,8 @@
FILE(GLOB_RECURSE LIB_SRC
"${CMAKE_CURRENT_LIST_DIR}/*.h"
"${CMAKE_CURRENT_LIST_DIR}/*.hpp"
"${CMAKE_CURRENT_LIST_DIR}/*.cpp"
"${CMAKE_CURRENT_LIST_DIR}/*.c"
"${CMAKE_CURRENT_LIST_DIR}/*.cpp"
"${CMAKE_CURRENT_LIST_DIR}/*.s"
"${CMAKE_CURRENT_LIST_DIR}/*.S"
)
@@ -16,7 +16,8 @@ FILE(GLOB_RECURSE ASSEMBLY_FILES
"${CMAKE_CURRENT_LIST_DIR}/*.S"
)
include_directories(${CMAKE_CURRENT_LIST_DIR}/drivers/include ${CMAKE_CURRENT_LIST_DIR}/bsp/include)
include_directories(${SDK_ROOT}/third_party/xtl/include)
include_directories(${CMAKE_CURRENT_LIST_DIR}/drivers/include ${CMAKE_CURRENT_LIST_DIR}/bsp/include ${CMAKE_CURRENT_LIST_DIR}/nncase/include)
SET_PROPERTY(SOURCE ${ASSEMBLY_FILES} PROPERTY LANGUAGE C)
SET_SOURCE_FILES_PROPERTIES(${ASSEMBLY_FILES} PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp -D __riscv64")

@@ -663,18 +663,31 @@ typedef void (*kpu_done_callback_t)(void *userdata);
typedef struct
{
const uint8_t *model_buffer;
uint8_t *main_buffer;
uint32_t output_count;
const kpu_model_output_t *outputs;
const kpu_model_layer_header_t *layer_headers;
const uint8_t *body_start;
uint32_t layers_length;
volatile uint32_t current_layer;
const uint8_t *volatile current_body;
dmac_channel_number_t dma_ch;
kpu_done_callback_t done_callback;
void *userdata;
int is_nncase;
union
{
struct
{
const uint8_t *model_buffer;
uint8_t *main_buffer;
uint32_t output_count;
const kpu_model_output_t *outputs;
const kpu_model_layer_header_t *layer_headers;
const uint8_t *body_start;
uint32_t layers_length;
volatile uint32_t current_layer;
const uint8_t *volatile current_body;
dmac_channel_number_t dma_ch;
kpu_done_callback_t done_callback;
void *userdata;
};
struct
{
void* nncase_ctx;
};
};
} kpu_model_context_t;
typedef struct
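Note (illustration, not part of this commit): is_nncase is the tag for the anonymous union above; the V3 loader fills the legacy field block, while the V4 path stores only an opaque interpreter handle in nncase_ctx. A minimal sketch of the access pattern, assuming the struct as declared:

    // Sketch only: how the tagged union is meant to be read.
    void describe_ctx(const kpu_model_context_t *ctx)
    {
        if (ctx->is_nncase)
            handle_v4(ctx->nncase_ctx);                     // opaque nncase interpreter
        else
            handle_v3(ctx->model_buffer, ctx->main_buffer); // legacy V3 fields
    }
    // handle_v4/handle_v3 are hypothetical helpers, named here for illustration.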

@@ -10,6 +10,7 @@
#include "dmac.h"
#include "kpu.h"
#include "printf.h"
#include "nncase.h"
#define LAYER_BURST_SIZE 12
@@ -1361,6 +1362,7 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
if(header->version == 3 && header->arch == 0)
{
ctx->is_nncase = 0;
ctx->model_buffer = buffer;
ctx->output_count = header->output_count;
ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
@@ -1370,6 +1372,9 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
if(!ctx->main_buffer)
return -1;
} else if(header->version == 'KMDL')
{
return nncase_load_kmodel(ctx, buffer);
} else
{
return -1;
@@ -1380,6 +1385,9 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
if(ctx->is_nncase)
return nncase_get_output(ctx, index, data, size);
if(index >= ctx->output_count)
return -1;
@@ -1391,6 +1399,9 @@ int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
void kpu_model_free(kpu_model_context_t *ctx)
{
if(ctx->is_nncase)
return nncase_model_free(ctx);
free(ctx->main_buffer);
ctx->main_buffer = NULL;
}
@@ -1595,6 +1606,9 @@ static void ai_step_not_isr(void *userdata)
int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
if(ctx->is_nncase)
return nncase_run_kmodel(ctx, src, dma_ch, done_callback, userdata);
ctx->dma_ch = dma_ch;
ctx->done_callback = done_callback;
ctx->userdata = userdata;
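Note (illustration, not part of this commit): with the dispatch above, application code is identical for V3 and V4 models; the same four calls cover both. A minimal sketch, assuming a kmodel blob already in memory, a prepared input buffer, and a free DMAC channel (the names and channel choice are placeholders):

    #include "kpu.h"

    static volatile int g_done;
    static void on_done(void *userdata) { g_done = 1; }

    int run_once(const uint8_t *kmodel, const uint8_t *input)
    {
        kpu_model_context_t ctx;
        if (kpu_load_kmodel(&ctx, kmodel) != 0)    // accepts V3 and V4 ('KMDL')
            return -1;
        g_done = 0;
        if (kpu_run_kmodel(&ctx, input, DMAC_CHANNEL5, on_done, NULL) != 0)
            return -1;
        while (!g_done)
            ;                                       // poll; real code would sleep
        uint8_t *out;
        size_t size;
        kpu_get_output(&ctx, 0, &out, &size);
        // Consume `out` before freeing: it points into the model's working memory.
        kpu_model_free(&ctx);
        return 0;
    }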

lib/nncase/.clang-format (new file)

@@ -0,0 +1,8 @@
---
BasedOnStyle: WebKit
BreakBeforeBraces: Allman
ConstructorInitializerAllOnOneLineOrOnePerLine: 'true'
UseTab: Never
PointerAlignment: Right
...

@@ -0,0 +1,97 @@
#pragma once
#include <array>
#include <optional>
#include <stdint.h>
namespace nncase
{
typedef enum _datatype
{
dt_float32,
dt_uint8
} datatype_t;
struct padding
{
int32_t before;
int32_t after;
int32_t sum() const noexcept { return before + after; }
static padding zero() noexcept { return {}; }
};
template <class T>
struct value_range
{
T min;
T max;
};
typedef enum _reduce_op
{
reduce_mean,
reduce_min,
reduce_max
} reduce_op_t;
typedef enum _binary_op
{
binary_add,
binary_sub,
binary_mul,
binary_div
} binary_op_t;
typedef struct _quant_param
{
int32_t zero_point;
float scale;
} quant_param_t;
inline bool operator==(const quant_param_t &lhs, const quant_param_t &rhs) noexcept
{
return lhs.zero_point == rhs.zero_point && lhs.scale == rhs.scale;
}
struct fixed_mul
{
float mul;
int8_t shift;
};
typedef enum _memory_type
{
mem_const,
mem_main,
mem_k210_kpu
} memory_type_t;
using runtime_shape_t = std::array<int, 4>;
using runtime_paddings_t = std::array<padding, 4>;
struct scalar
{
datatype_t type;
std::array<uint8_t, 4> storage;
scalar() = default;
template <class T>
scalar(T &&value) { as<T>() = value; }
template <class T>
T &as() noexcept { return *reinterpret_cast<T *>(storage.data()); }
template <class T>
const T &as() const noexcept { return *reinterpret_cast<const T *>(storage.data()); }
};
struct memory_range
{
memory_type_t memory_type;
datatype_t datatype;
uint32_t start;
uint32_t size;
};
}
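Note (illustration, not part of this commit): quant_param_t encodes the usual affine quantization. In this codebase the convention is q = round(x * scale + zero_point) and x = (q - zero_point) / scale (see quantize/dequantize in the neutral kernels further down). Worked numbers as a sketch, with made-up parameters for a [-1, 1] range:

    #include <cmath>

    nncase::quant_param_t p { 127, 127.5f };             // zero_point, scale
    float x = 0.5f;
    int q = (int)std::round(x * p.scale + p.zero_point);  // 191
    float back = (q - p.zero_point) / p.scale;            // ~0.502, error ~0.002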

@@ -0,0 +1,257 @@
#pragma once
#include "../utils.h"
#include <runtime_op_utility.h>
namespace nncase
{
namespace kernels
{
namespace cpu
{
inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < out_channels; oc++)
{
auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
float value = bias[oc];
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
for (int ic = 0; ic < in_shape[3]; ic++)
value += in_pix[ic] * w_pix[ic];
}
}
*output++ = details::apply_activation(value, fused_activation);
}
}
}
}
}
inline void depthwise_conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < in_shape[3]; oc++)
{
auto w_oc = weights + (size_t)oc * filter_h * filter_w;
float value = bias[oc];
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
value += in_pix[oc] * w_pix[0];
}
}
*output++ = details::apply_activation(value, fused_activation);
}
}
}
}
}
template <class TBinaryOp, class TOutputOp>
void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < in_shape[3]; oc++)
{
float value = init_value;
int32_t kernel_count = 0;
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
value = binary_op(value, in_pix[oc]);
kernel_count++;
}
}
*output++ = details::apply_activation(window_op(value, kernel_count), fused_activation);
}
}
}
}
}
inline void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < out_channels; oc++)
{
auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
int32_t value = bias[oc];
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
for (int ic = 0; ic < in_shape[3]; ic++)
value += (in_pix[ic] - input_offset) * (w_pix[ic] - filter_offset);
}
}
value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
*output++ = (uint8_t)std::clamp(value, 0, 255);
}
}
}
}
}
inline void quantized_depthwise_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < in_shape[3]; oc++)
{
auto w_oc = weights + (size_t)oc * filter_h * filter_w;
int32_t value = bias[oc];
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
value += (in_pix[oc] - input_offset) * (w_pix[0] - filter_offset);
}
}
value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
*output++ = (uint8_t)std::clamp(value, 0, 255);
}
}
}
}
}
}
}
}

@@ -0,0 +1,256 @@
#pragma once
#include "../utils.h"
#include <runtime_op_utility.h>
#include <targets/k210/k210_runtime_op_utility.h>
namespace nncase
{
namespace kernels
{
namespace k210
{
inline void kpu_upload(const uint8_t *src, uint8_t *dest, const runtime_shape_t &in_shape)
{
if (in_shape[3] % 64 == 0)
{
std::copy(src, src + kernels::details::compute_size(in_shape), dest);
}
else
{
auto layout = targets::k210::get_kpu_row_layout(in_shape[3]);
auto fmap_size = targets::k210::get_kpu_bytes(in_shape[3], in_shape[2], in_shape[1]);
for (int32_t batch = 0; batch < in_shape[0]; batch++)
{
auto batch_origin = dest + (size_t)batch * fmap_size;
for (int32_t oc = 0; oc < in_shape[1]; oc++)
{
auto channel_origin = batch_origin + (size_t)oc / layout.groups * layout.row_len * in_shape[2] * 64 + (size_t)oc % layout.groups * layout.row_pitch;
for (int32_t y = 0; y < in_shape[2]; y++)
{
auto y_origin = channel_origin + (size_t)y * layout.row_len * 64;
std::copy(src, src + in_shape[3], y_origin);
src += in_shape[3];
}
}
}
}
}
#if NNCASE_TARGET_K210_SIMULATOR
inline void kpu_download(const uint8_t *src, uint8_t *dest, const runtime_shape_t &in_shape)
{
if (in_shape[3] % 64 == 0)
{
std::copy(src, src + kernels::details::compute_size(in_shape), dest);
}
else
{
auto layout = targets::k210::get_kpu_row_layout(in_shape[3]);
auto fmap_size = targets::k210::get_kpu_bytes(in_shape[3], in_shape[2], in_shape[1]);
for (int32_t batch = 0; batch < in_shape[0]; batch++)
{
auto batch_origin = src + (size_t)batch * fmap_size;
for (int32_t oc = 0; oc < in_shape[1]; oc++)
{
auto channel_origin = batch_origin + (size_t)oc / layout.groups * layout.row_len * in_shape[2] * 64 + (size_t)oc % layout.groups * layout.row_pitch;
for (int32_t y = 0; y < in_shape[2]; y++)
{
auto y_origin = channel_origin + (size_t)y * layout.row_len * 64;
for (int32_t x = 0; x < in_shape[3]; x++)
*dest++ = y_origin[x];
}
}
}
}
}
template <bool IsDepthwise, int32_t FilterSize>
void kpu_conv2d(const uint8_t *input, int64_t *workspace, uint8_t *output, const uint8_t *weights, int32_t in_h, int32_t in_w, int32_t in_channels, int32_t out_channels, uint8_t pad_value, int32_t arg_x,
int32_t shift_x, int32_t arg_w, int32_t shift_w, int64_t arg_add, const targets::k210::kpu_batchnorm_segment *batchnorm, const targets::k210::kpu_activation_table_t &activation)
{
const auto channel_size = size_t(in_h) * in_w;
// conv
{
auto out_it = workspace;
const auto pad = FilterSize == 1 ? 0 : 1;
const auto groups = IsDepthwise ? out_channels : 1;
const auto g_ic = IsDepthwise ? 1 : in_channels / groups;
const auto g_oc = IsDepthwise ? 1 : out_channels;
for (int32_t og = 0; og < groups; og++)
{
const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * FilterSize * FilterSize;
for (int32_t oc = 0; oc < g_oc; oc++)
{
const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * FilterSize * FilterSize;
for (int32_t oy = 0; oy < in_h; oy++)
{
for (int32_t ox = 0; ox < in_w; ox++)
{
const int32_t in_y_origin = oy - pad;
const int32_t in_x_origin = ox - pad;
int64_t value = 0;
int64_t sum_x = 0, sum_w = 0;
for (int32_t ic = 0; ic < g_ic; ic++)
{
const uint8_t *in_c_p = input + ((size_t)og * g_ic + ic) * in_h * in_w;
const uint8_t *w_ic_p = w_oc_p + (size_t)ic * FilterSize * FilterSize;
for (int32_t ky = 0; ky < FilterSize; ky++)
{
for (int32_t kx = 0; kx < FilterSize; kx++)
{
const int32_t in_y = in_y_origin + ky;
const int32_t in_x = in_x_origin + kx;
uint8_t x;
if (in_x < 0 || in_x >= in_w
|| in_y < 0 || in_y >= in_h)
x = pad_value;
else
x = in_c_p[in_y * in_w + in_x];
uint8_t w = w_ic_p[ky * FilterSize + kx];
sum_x += x;
sum_w += w;
value += (int32_t)x * w;
}
}
}
*out_it++ = value + (arg_x * sum_x >> shift_x) + (arg_w * sum_w >> shift_w) + arg_add * g_ic;
}
}
}
}
}
// bn act
{
auto src_it = workspace;
auto out_it = output;
for (int32_t oc = 0; oc < out_channels; oc++)
{
const auto &bn = batchnorm[oc];
for (size_t i = 0; i < channel_size; i++)
{
auto value = (*src_it++ * bn.mul >> bn.shift) + bn.add;
auto &seg = *std::find_if(activation.rbegin(), activation.rend(), [value](const targets::k210::kpu_activation_segment &seg) {
return value > seg.start_x;
});
value = runtime::carry_shift((value - seg.start_x) * seg.mul, seg.shift);
*out_it++ = (uint8_t)std::clamp(value, int64_t(0), int64_t(255));
}
}
}
}
inline void kpu_pool2d(const uint8_t *input, uint8_t *output, int32_t in_h, int32_t in_w, int32_t in_channels, targets::k210::kpu_pool_type_t pool_type)
{
using namespace targets::k210;
const auto filter = get_kpu_filter_size(pool_type);
const auto stride = get_kpu_filter_stride(pool_type);
const auto out_h = get_kpu_pool_output_size(in_h, pool_type);
const auto out_w = get_kpu_pool_output_size(in_w, pool_type);
for (int32_t oc = 0; oc < in_channels; oc++)
{
auto in_c_p = input + (size_t)oc * in_h * in_w;
for (int32_t oy = 0; oy < out_h; oy++)
{
for (int32_t ox = 0; ox < out_w; ox++)
{
const int32_t in_y_origin = oy * stride;
const int32_t in_x_origin = ox * stride;
int32_t value = 0;
switch (pool_type)
{
case kpu_pool_bypass:
{
const int32_t in_y = in_y_origin;
const int32_t in_x = in_x_origin;
value = in_c_p[in_y * in_w + in_x];
break;
}
case kpu_pool_max_2_s2:
case kpu_pool_max_2_s1:
case kpu_pool_max_4_s4:
{
for (int32_t ky = 0; ky < filter; ky++)
{
for (int32_t kx = 0; kx < filter; kx++)
{
const int32_t in_y = in_y_origin + ky;
const int32_t in_x = in_x_origin + kx;
int32_t in_v;
if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
in_v = 0;
else
in_v = in_c_p[in_y * in_w + in_x];
value = std::max(value, in_v);
}
}
break;
}
case kpu_pool_mean_2_s2:
case kpu_pool_mean_2_s1:
case kpu_pool_mean_4_s4:
{
for (int32_t ky = 0; ky < filter; ky++)
{
for (int32_t kx = 0; kx < filter; kx++)
{
const int32_t in_y = std::clamp(in_y_origin + ky, 0, in_h - 1);
const int32_t in_x = std::clamp(in_x_origin + kx, 0, in_w - 1);
const int32_t in_v = in_c_p[in_y * in_w + in_x];
value += in_v;
}
}
value /= filter * filter;
break;
}
case kpu_pool_left_top_2_s2:
case kpu_pool_left_top_4_s4:
case kpu_pool_right_top_2_s2:
{
auto k_off = get_kpu_select_pool_offset(pool_type);
const int32_t in_y = in_y_origin + k_off[0];
const int32_t in_x = in_x_origin + k_off[1];
int32_t in_v;
if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
in_v = 0;
else
in_v = in_c_p[in_y * in_w + in_x];
value = in_v;
break;
}
}
*output++ = (uint8_t)value;
}
}
}
}
#endif
}
}
}

@@ -0,0 +1,422 @@
#pragma once
#include "../utils.h"
#include <cmath>
#include <runtime_op_utility.h>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace kernels
{
namespace neutral
{
template <class TOp>
void binary(const float *input_a, const float *input_b, float *output, const runtime_shape_t &in_a_shape,
const runtime_shape_t &in_b_shape, const runtime_shape_t &out_shape, const value_range<float> &fused_activation, TOp &&op)
{
for (int32_t d0 = 0; d0 < out_shape[0]; d0++)
{
for (int32_t d1 = 0; d1 < out_shape[1]; d1++)
{
for (int32_t d2 = 0; d2 < out_shape[2]; d2++)
{
for (int32_t d3 = 0; d3 < out_shape[3]; d3++)
{
runtime_shape_t in_off = { d0, d1, d2, d3 };
const auto in_a_off = kernels::details::get_reduced_offset(in_off, in_a_shape);
const auto in_b_off = kernels::details::get_reduced_offset(in_off, in_b_shape);
const auto a = input_a[offset(in_a_shape, in_a_off)];
const auto b = input_b[offset(in_b_shape, in_b_off)];
output[offset(out_shape, in_off)] = kernels::details::apply_activation(op(a, b), fused_activation);
}
}
}
}
}
template <class TRange, class TPtrGetter = details::default_ptr_getter<uint8_t, TRange>>
inline void concat(xtl::span<TRange> inputs, uint8_t *output, xtl::span<const int32_t> concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {})
{
for (size_t oc = 0; oc < outer_size; oc++)
{
for (size_t i = 0; i < inputs.size(); i++)
{
auto size = inner_size * concat_dims[i];
auto src = getter(inputs[i]) + oc * size;
std::copy(src, src + size, output);
output += size;
}
}
}
inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
int32_t groups, int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
const auto out_h = details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
const auto g_ic = in_shape[1] / groups;
const auto g_oc = out_channels / groups;
for (int32_t batch = 0; batch < in_shape[0]; batch++)
{
const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int32_t og = 0; og < groups; og++)
{
const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
for (int32_t oc = 0; oc < g_oc; oc++)
{
const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
for (int32_t oy = 0; oy < out_h; oy++)
{
for (int32_t ox = 0; ox < out_w; ox++)
{
const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
float value = bias[oc];
for (int32_t ic = 0; ic < g_ic; ic++)
{
const float *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
{
const int32_t in_y = in_y_origin + dilation_h * ky;
const int32_t in_x = in_x_origin + dilation_w * kx;
const float in_v = in_c_p[in_y * in_shape[3] + in_x];
const float w = w_ic_p[ky * filter_w + kx];
value += in_v * w;
}
}
}
*output++ = details::apply_activation(value, fused_activation);
}
}
}
}
}
}
template <class TQ>
void dequantize(const TQ *input, float *output, size_t count, const quant_param_t &param)
{
float div = 1.f / param.scale;
for (size_t i = 0; i < count; i++)
{
output[i] = (input[i] - param.zero_point) * div;
}
}
inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
{
for (size_t oy = 0; oy < a_rows; oy++)
{
for (size_t ox = 0; ox < b_cols; ox++)
{
float value = bias[ox];
for (size_t i = 0; i < a_cols; i++)
{
const auto a = input_a[oy * a_cols + i];
const auto b = input_b[i * b_cols + ox];
value += a * b;
}
output[oy * b_cols + ox] = details::apply_activation(value, fused_activation);
}
}
}
template <class T>
void pad(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_paddings_t &paddings, T pad_value)
{
runtime_shape_t out_shape = { in_shape[0] + paddings[0].sum(),
in_shape[1] + paddings[1].sum(),
in_shape[2] + paddings[2].sum(),
in_shape[3] + paddings[3].sum() };
for (int d0 = 0; d0 < out_shape[0]; d0++)
{
auto d0_origin = -paddings[0].before;
auto in0 = input + ((size_t)d0_origin + d0) * in_shape[1] * in_shape[2] * in_shape[3];
for (int d1 = 0; d1 < out_shape[1]; d1++)
{
auto d1_origin = -paddings[1].before;
auto in1 = in0 + ((size_t)d1_origin + d1) * in_shape[2] * in_shape[3];
for (int d2 = 0; d2 < out_shape[2]; d2++)
{
auto d2_origin = -paddings[2].before;
auto in2 = in1 + ((size_t)d2_origin + d2) * in_shape[3];
for (int d3 = 0; d3 < out_shape[3]; d3++)
{
auto d3_origin = -paddings[3].before;
if (d0 < paddings[0].before || d0 >= out_shape[0] - paddings[0].after
|| d1 < paddings[1].before || d1 >= out_shape[1] - paddings[1].after
|| d2 < paddings[2].before || d2 >= out_shape[2] - paddings[2].after
|| d3 < paddings[3].before || d3 >= out_shape[3] - paddings[3].after)
*output++ = pad_value;
else
*output++ = in2[d3_origin + d3];
}
}
}
}
}
template <class TQ>
void quantize(const float *input, TQ *output, size_t count, const quant_param_t &param)
{
for (size_t i = 0; i < count; i++)
{
int32_t tmp = (int32_t)roundf(input[i] * param.scale + param.zero_point);
output[i] = std::clamp(tmp, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
}
}
template <class TReducer>
void reduce(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, const runtime_shape_t &reduced_shape, TReducer &&reducer)
{
std::fill(output, output + kernels::details::compute_size(reduced_shape), init_value);
for (int32_t d0 = 0; d0 < in_shape[0]; d0++)
{
for (int32_t d1 = 0; d1 < in_shape[1]; d1++)
{
for (int32_t d2 = 0; d2 < in_shape[2]; d2++)
{
for (int32_t d3 = 0; d3 < in_shape[3]; d3++)
{
runtime_shape_t in_off = { d0, d1, d2, d3 };
auto out_off = kernels::details::get_reduced_offset(in_off, reduced_shape);
const auto a = input[offset(in_shape, in_off)];
auto &b = output[offset(reduced_shape, out_off)];
b = reducer(b, a);
}
}
}
}
}
template <class TOp>
void unary(const float *input, float *output, size_t count, TOp &&op)
{
for (size_t i = 0; i < count; i++)
output[i] = op(input[i]);
}
template <class TBinaryOp, class TOutputOp>
void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, int32_t filter_h, int32_t filter_w,
int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, const padding &padding_h, const padding &padding_w,
const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
{
const auto out_h = kernels::details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = kernels::details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
runtime_shape_t out_shape { in_shape[0], in_shape[1], out_h, out_w };
for (int32_t batch = 0; batch < in_shape[0]; batch++)
{
for (int32_t oc = 0; oc < in_shape[1]; oc++)
{
for (int32_t oy = 0; oy < out_h; oy++)
{
for (int32_t ox = 0; ox < out_w; ox++)
{
const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
float value = init_value;
int32_t kernel_count = 0;
for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
{
const int32_t in_y = in_y_origin + dilation_h * ky;
const int32_t in_x = in_x_origin + dilation_w * kx;
const float in_v = input[offset(in_shape, { batch, oc, in_y, in_x })];
value = binary_op(value, in_v);
kernel_count++;
}
}
output[offset(out_shape, { batch, oc, oy, ox })] = kernels::details::apply_activation(window_op(value, kernel_count), fused_activation);
}
}
}
}
}
template <class T>
void resize_nearest_neighbor(const T *input, T *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w)
{
auto height_scale = (float)in_shape[2] / out_h;
auto width_scale = (float)in_shape[3] / out_w;
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oc = 0; oc < in_shape[1]; oc++)
{
auto in_c = in_batch + oc * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
auto in_y = std::min((int32_t)floorf(oy * height_scale), in_shape[2] - 1);
auto in_row = in_c + in_y * in_shape[3];
for (int ox = 0; ox < out_w; ox++)
{
auto in_x = std::min((int32_t)floorf(ox * width_scale), in_shape[3] - 1);
*output++ = in_row[in_x];
}
}
}
}
}
inline void resize_bilinear(const float *input, float *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
{
auto height_scale = (float)in_shape[2] / out_h;
auto width_scale = (float)in_shape[3] / out_w;
if (align_corners && out_h > 1)
height_scale = (float)(in_shape[2] - 1) / (out_h - 1);
if (align_corners && out_w > 1)
width_scale = (float)(in_shape[3] - 1) / (out_w - 1);
auto destIdx = 0;
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oc = 0; oc < in_shape[1]; oc++)
{
auto in_c = in_batch + (size_t)oc * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
auto in_y = oy * height_scale;
auto in_y0 = (int)floorf(in_y);
auto in_y1 = std::min(in_y0 + 1, in_shape[2] - 1);
for (int ox = 0; ox < out_w; ox++)
{
auto in_x = ox * width_scale;
auto in_x0 = (int)floorf(in_x);
auto in_x1 = std::min(in_x0 + 1, in_shape[3] - 1);
auto v0 = in_c[in_y0 * in_shape[3] + in_x0];
auto v1 = in_c[in_y1 * in_shape[3] + in_x0];
auto v2 = in_c[in_y0 * in_shape[3] + in_x1];
auto v3 = in_c[in_y1 * in_shape[3] + in_x1];
auto a0 = (1 - (in_y - in_y0)) * (1 - (in_x - in_x0));
auto a1 = (in_y - in_y0) * (1 - (in_x - in_x0));
auto a2 = (1 - (in_y - in_y0)) * (in_x - in_x0);
auto a3 = (in_y - in_y0) * (in_x - in_x0);
output[destIdx++] = v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3;
}
}
}
}
}
inline void softmax(const float *input, float *output, float beta, int32_t outer_size, size_t inner_size)
{
for (size_t batch = 0; batch < outer_size; batch++)
{
auto src = input + batch * inner_size;
auto dest = output + batch * inner_size;
auto max = *std::max_element(src, src + inner_size);
float sum = 0;
for (size_t i = 0; i < inner_size; i++)
{
auto value = expf((src[i] - max) * beta);
sum += value;
dest[i] = value;
}
for (size_t i = 0; i < inner_size; i++)
dest[i] /= sum;
}
}
template <class T>
void transpose(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &perm)
{
runtime_shape_t out_shape;
for (size_t i = 0; i < 4; i++)
out_shape[i] = in_shape[perm[i]];
runtime_shape_t i, o;
for (o[3] = 0; o[3] < out_shape[3]; o[3]++)
{
i[perm[3]] = o[3];
for (o[2] = 0; o[2] < out_shape[2]; o[2]++)
{
i[perm[2]] = o[2];
for (o[1] = 0; o[1] < out_shape[1]; o[1]++)
{
i[perm[1]] = o[1];
for (o[0] = 0; o[0] < out_shape[0]; o[0]++)
{
i[perm[0]] = o[0];
output[offset(out_shape, o)] = input[offset(in_shape, i)];
}
}
}
}
}
template <class T>
void strided_slice(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &begin, const runtime_shape_t &end, const runtime_shape_t &strides)
{
auto loop_cond = [](int32_t i, int32_t stop, int32_t stride) {
return stride > 0 ? i < stop : i > stop;
};
for (int32_t d0 = begin[0]; loop_cond(d0, end[0], strides[0]); d0 += strides[0])
{
auto d0_origin = input + (size_t)d0 * in_shape[1] * in_shape[2] * in_shape[3];
for (int d1 = begin[1]; loop_cond(d1, end[1], strides[1]); d1 += strides[1])
{
auto d1_origin = d0_origin + (size_t)d1 * in_shape[2] * in_shape[3];
for (int32_t d2 = begin[2]; loop_cond(d2, end[2], strides[2]); d2 += strides[2])
{
auto d2_origin = d1_origin + (size_t)d2 * in_shape[3];
for (int32_t d3 = begin[3]; loop_cond(d3, end[3], strides[3]); d3 += strides[3])
*output++ = d2_origin[d3];
}
}
}
}
}
}
}

@@ -0,0 +1,82 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <datatypes.h>
namespace nncase
{
namespace kernels
{
inline size_t offset(const runtime_shape_t &shape, const runtime_shape_t &index)
{
return (((size_t)index[0] * shape[1] + index[1]) * shape[2] + index[2]) * shape[3] + index[3];
}
namespace details
{
inline int32_t get_windowed_output_size(int32_t size, int32_t filter, int32_t stride, int32_t dilation, const padding &padding)
{
auto effective_filter_size = (filter - 1) * dilation + 1;
return (size + padding.before + padding.after - effective_filter_size + stride) / stride;
}
inline size_t compute_size(const runtime_shape_t &shape)
{
return size_t(shape[0]) * shape[1] * shape[2] * shape[3];
}
template <class T>
inline T apply_activation(T value, value_range<T> activation)
{
return std::clamp(value, activation.min, activation.max);
}
inline runtime_shape_t get_reduced_offset(const runtime_shape_t &in_offset, const runtime_shape_t &reduced_shape)
{
runtime_shape_t off;
for (size_t i = 0; i < in_offset.size(); i++)
{
if (in_offset[i] >= reduced_shape[i])
off[i] = 0;
else
off[i] = in_offset[i];
}
return off;
}
template <class T, class TRange>
struct default_ptr_getter
{
T *operator()(const TRange &range) const noexcept { return range; }
};
template <int32_t Bits>
int32_t to_signed(uint32_t value)
{
auto mask = uint32_t(1) << (Bits - 1);
if (Bits != 32 && (value & mask) != 0)
{
auto sign = 0xFFFFFFFF << Bits;
return (int)(value | sign);
}
return (int32_t)value;
}
template <int32_t Bits>
int64_t to_signed(uint64_t value)
{
auto mask = uint64_t(1) << (Bits - 1);
if ((value & mask) != 0)
{
auto sign = 0xFFFFFFFFFFFFFFFF << Bits;
return (int64_t)(value | sign);
}
return (int64_t)value;
}
}
}
}
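Note (illustration, not part of this commit): two of these helpers carry most of the indexing logic. offset() is plain NCHW row-major linearization, and get_windowed_output_size() is the usual floor((size + pad_sum - effective_filter) / stride) + 1, folded into a single division. Worked numbers as a sketch:

    using nncase::kernels::offset;
    using nncase::kernels::details::get_windowed_output_size;

    nncase::runtime_shape_t shape { 1, 2, 3, 4 };
    size_t off = offset(shape, { 0, 1, 2, 3 });        // ((0*2+1)*3+2)*4+3 = 23

    // size=5, filter=3, stride=2, dilation=1, padding {1,1}:
    // effective filter = (3-1)*1+1 = 3, so (5+1+1-3+2)/2 = 3 output positions.
    int32_t out = get_windowed_output_size(5, 3, 2, 1, { 1, 1 });  // 3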

@@ -0,0 +1,33 @@
/* Copyright 2018 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _NNCASE_H
#define _NNCASE_H
#include "kpu.h"
#ifdef __cplusplus
extern "C" {
#endif
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer);
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size);
void nncase_model_free(kpu_model_context_t *ctx);
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata);
#ifdef __cplusplus
}
#endif
#endif

@@ -0,0 +1,51 @@
#pragma once
#include <iostream>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace runtime
{
class binary_writer
{
public:
binary_writer(std::ostream &stream)
: stream_(stream)
{
}
template <class T>
void write(T &&value)
{
stream_.write(reinterpret_cast<const char *>(&value), sizeof(value));
}
template <class T>
void write_array(xtl::span<const T> value)
{
stream_.write(reinterpret_cast<const char *>(value.data()), value.size_bytes());
}
std::streampos position() const
{
return stream_.tellp();
}
void position(std::streampos pos)
{
stream_.seekp(pos);
}
void align_position(size_t alignment)
{
auto pos = position();
auto rem = pos % alignment;
if (rem != 0)
position(pos + std::streamoff(alignment - rem));
}
private:
std::ostream &stream_;
};
}
}

@@ -0,0 +1,71 @@
#pragma once
#include "model.h"
#include <chrono>
#include <memory>
#include <optional>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace runtime
{
class interpreter_base;
typedef void (*run_callback_t)(void *userdata);
typedef void (*error_callback_t)(const char *err, void *userdata);
typedef void (*node_profile_callback_t)(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata);
typedef void (interpreter_base::*interpreter_step_t)();
class interpreter_base
{
using clock_t = std::chrono::system_clock;
public:
bool try_load_model(const uint8_t *buffer);
size_t inputs_size() const noexcept { return model_header_->inputs; }
size_t outputs_size() const noexcept { return model_header_->outputs; }
size_t nodes_size() const noexcept { return model_header_->nodes; }
const runtime_shape_t &input_shape_at(size_t index) const noexcept { return input_shapes_.at(index); }
const memory_range &input_at(size_t index) const noexcept { return inputs_[index]; }
const memory_range &output_at(size_t index) const noexcept { return outputs_[index]; }
template <class T>
xtl::span<T> memory_at(const memory_range &range) const noexcept
{
auto span = memory_at(range);
return { reinterpret_cast<T *>(span.data()), span.size() / sizeof(T) };
}
std::chrono::nanoseconds total_duration() const noexcept { return total_duration_; }
void run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata);
protected:
virtual bool initialize();
virtual xtl::span<uint8_t> memory_at(const memory_range &range) const noexcept;
private:
void step();
private:
const model_header *model_header_;
std::unique_ptr<uint8_t[]> main_mem_;
xtl::span<const memory_range> inputs_;
xtl::span<const memory_range> outputs_;
xtl::span<const runtime_shape_t> input_shapes_;
xtl::span<const node_header> node_headers_;
xtl::span<const uint8_t> constants_;
const uint8_t *node_body_start_;
error_callback_t on_error_;
run_callback_t run_callback_;
node_profile_callback_t node_profile_;
void *userdata_;
size_t cnt_node_;
const uint8_t *cnt_node_body_;
std::chrono::nanoseconds total_duration_;
std::optional<clock_t::time_point> last_time_;
runtime_opcode last_op_;
};
}
}
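Note (illustration, not part of this commit): the intended call sequence is load, fill inputs, run, read outputs; memory_at<T> turns a memory_range into a typed span over whatever backing store the target's override selects. A hedged sketch (kmodel_buffer, on_run_done, on_error and userdata are placeholders):

    nncase::runtime::interpreter_t interp;  // target-selected alias, see target_config.h
    if (!interp.try_load_model(kmodel_buffer))
        return;
    auto in = interp.memory_at<float>(interp.input_at(0));
    // ... fill `in` with the input tensor ...
    interp.run(on_run_done, on_error, nullptr, userdata);
    auto out = interp.memory_at<uint8_t>(interp.output_at(0));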

@@ -0,0 +1,20 @@
#pragma once
#include "target_config.h"
#include <datatypes.h>
#include <runtime/runtime_op.h>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace runtime
{
enum kernel_call_result
{
kcr_done,
kcr_async,
kcr_error
};
kernel_call_result call_kernel(runtime_opcode opcode, xtl::span<const uint8_t> body, interpreter_t &interpreter, interpreter_step_t step);
}
}

@@ -0,0 +1,38 @@
#pragma once
#include "../datatypes.h"
#include "runtime_op.h"
namespace nncase
{
namespace runtime
{
enum model_target : uint32_t
{
MODEL_TARGET_CPU = 0,
MODEL_TARGET_K210 = 1,
};
struct model_header
{
uint32_t identifier;
uint32_t version;
uint32_t flags;
model_target target;
uint32_t constants;
uint32_t main_mem;
uint32_t nodes;
uint32_t inputs;
uint32_t outputs;
uint32_t reserved0;
};
constexpr uint32_t MODEL_IDENTIFIER = 'KMDL';
constexpr uint32_t MODEL_VERSION = 4;
struct node_header
{
runtime_opcode opcode;
uint32_t body_size;
};
}
}
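Note (illustration, not part of this commit): MODEL_IDENTIFIER is the multi-character constant 'KMDL', which is why -Wno-multichar is added to the SDK flags earlier in this commit. It also explains the version check in kpu.c: the legacy kpu_kmodel_header_t begins with its version field, which occupies the same bytes as model_header::identifier, so a V4 file read through the old struct shows version == 'KMDL'. Sketch, assuming both headers start at offset 0 of the buffer:

    // Same bytes, two views:
    auto old_hdr = reinterpret_cast<const kpu_kmodel_header_t *>(buffer);
    auto new_hdr = reinterpret_cast<const nncase::runtime::model_header *>(buffer);
    // For a V4 file: old_hdr->version aliases new_hdr->identifier == 'KMDL'.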

@@ -0,0 +1,32 @@
BEGINE_DEFINE_TARGET(neutral)
DEFINE_RUNTIME_OP(neutral, binary, Binary, 0)
DEFINE_RUNTIME_OP(neutral, concat, Concat, 1)
DEFINE_RUNTIME_OP(neutral, conv2d, Conv2D, 2)
DEFINE_RUNTIME_OP(neutral, dequantize, Dequantize, 3)
DEFINE_RUNTIME_OP(neutral, matmul, MatMul, 4)
DEFINE_RUNTIME_OP(neutral, pad, Pad, 5)
DEFINE_RUNTIME_OP(neutral, quantize, Quantize, 6)
DEFINE_RUNTIME_OP(neutral, reduce, Reduce, 7)
DEFINE_RUNTIME_OP(neutral, reduce_window2d, ReduceWindow2D, 8)
DEFINE_RUNTIME_OP(neutral, memory_copy, MemoryCopy, 9)
DEFINE_RUNTIME_OP(neutral, resize_bilinear, ResizeBilinear, 10)
DEFINE_RUNTIME_OP(neutral, resize_nearest_neighbor, ResizeNearestNeighbor, 11)
DEFINE_RUNTIME_OP(neutral, softmax, Softmax, 12)
DEFINE_RUNTIME_OP(neutral, transpose, Transpose, 13)
DEFINE_RUNTIME_OP(neutral, strided_slice, StridedSlice, 14)
END_DEFINE_TARGET()
// CPU
BEGINE_DEFINE_TARGET(cpu)
DEFINE_RUNTIME_OP(cpu, cpu_conv2d, CPU_CPUConv2D, 1001)
DEFINE_RUNTIME_OP(cpu, cpu_depthwise_conv2d, CPU_CPUDepthwiseConv2D, 1002)
DEFINE_RUNTIME_OP(cpu, cpu_reduce_window2d, CPU_CPUReduceWindow2D, 1003)
DEFINE_RUNTIME_OP(cpu, cpu_quantized_conv2d, CPU_CPUQuantizedConv2D, 1004)
DEFINE_RUNTIME_OP(cpu, cpu_quantized_depthwise_conv2d, CPU_CPUQuantizedDepthwiseConv2D, 1005)
END_DEFINE_TARGET()
// K210
BEGINE_DEFINE_TARGET(k210)
DEFINE_RUNTIME_OP(k210, kpu_upload, K210_KPUUpload, 2001)
DEFINE_RUNTIME_OP(k210, kpu_conv2d, K210_KPUConv2D, 2002)
END_DEFINE_TARGET()

@@ -0,0 +1,37 @@
#pragma once
#include "../datatypes.h"
#include <string_view>
namespace nncase
{
namespace runtime
{
#define BEGINE_DEFINE_TARGET(...)
#define DEFINE_RUNTIME_OP(target, id, name, value) rop_##id = value,
#define END_DEFINE_TARGET()
enum runtime_opcode : uint32_t
{
#include "runtime_op.def"
};
#undef DEFINE_RUNTIME_OP
#define DEFINE_RUNTIME_OP(target, id, name, value) \
case rop_##id: \
return #name;
constexpr std::string_view node_opcode_names(runtime_opcode opcode)
{
switch (opcode)
{
#include "runtime_op.def"
default:
return {};
}
}
#undef BEGINE_DEFINE_TARGET
#undef DEFINE_RUNTIME_OP
#undef END_DEFINE_TARGET
}
}
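Note (illustration, not part of this commit): runtime_op.def is an X-macro table; runtime_op.h includes it twice with different definitions of DEFINE_RUNTIME_OP, once to emit the enumerators and once to emit the name lookup. For a single entry the two expansions look like:

    // DEFINE_RUNTIME_OP(neutral, conv2d, Conv2D, 2) expands to:
    rop_conv2d = 2,      // first include: inside enum runtime_opcode
    case rop_conv2d:     // second include: inside node_opcode_names()
        return "Conv2D";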

@@ -0,0 +1,82 @@
#pragma once
#include <xtl/xspan.hpp>
namespace nncase
{
namespace runtime
{
class span_reader
{
public:
span_reader(xtl::span<const uint8_t> span)
: span_(span)
{
}
bool empty() const noexcept { return span_.empty(); }
template <class T>
T read()
{
auto value = *reinterpret_cast<const T *>(span_.data());
advance(sizeof(T));
return value;
}
template <class T>
void read(T &value)
{
value = *reinterpret_cast<const T *>(span_.data());
advance(sizeof(T));
}
template <class T>
void read_span(xtl::span<const T> &span, size_t size)
{
span = { reinterpret_cast<const T *>(span_.data()), size };
advance(sizeof(T) * size);
}
template <class T, ptrdiff_t N>
void read_span(xtl::span<const T, N> &span)
{
span = { reinterpret_cast<const T *>(span_.data()), N };
advance(sizeof(T) * N);
}
template <class T>
const T *peek() const noexcept
{
return reinterpret_cast<const T *>(span_.data());
}
template <class T>
void get_array(const T *&value, size_t size)
{
value = peek<T>();
advance(size * sizeof(T));
}
template <class T>
void get_ref(const T *&value)
{
value = peek<T>();
advance(sizeof(T));
}
void skip(size_t count)
{
advance(count);
}
private:
void advance(size_t count)
{
span_ = span_.subspan(count);
}
private:
xtl::span<const uint8_t> span_;
};
}
}
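Note (illustration, not part of this commit): span_reader is the deserialization primitive behind every *_options::deserialize in the target headers below. Reads are sequential and unchecked, so writer and reader must agree exactly on layout. A minimal sketch (node_body is a placeholder span over a serialized node):

    nncase::runtime::span_reader reader(node_body);  // xtl::span<const uint8_t>
    auto header = reader.read<nncase::runtime::node_header>();
    xtl::span<const float> bias;
    reader.read_span(bias, 16);   // borrows 16 floats in place, no copy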

@@ -0,0 +1,15 @@
#pragma once
#define NNCASE_CONCAT_3(a, b, c) a/b/c
#define NNCASE_TARGET_HEADER_(target, name) <NNCASE_CONCAT_3(targets, target, name)>
#define NNCASE_TARGET_HEADER(name) NNCASE_TARGET_HEADER_(NNCASE_TARGET, name)
#include NNCASE_TARGET_HEADER(interpreter.h)
namespace nncase
{
namespace runtime
{
using interpreter_t = nncase::targets::NNCASE_TARGET::interpreter;
}
}

@@ -0,0 +1,70 @@
#pragma once
#include <cassert>
#include <datatypes.h>
namespace nncase
{
namespace runtime
{
inline size_t get_bytes(datatype_t type)
{
size_t element_size;
switch (type)
{
case dt_float32:
element_size = 4;
break;
case dt_uint8:
element_size = 1;
break;
default:
assert(!"Not supported data type");
}
return element_size;
}
template <int32_t Bits, class T>
uint8_t count_leading_zeros(T value)
{
uint8_t num_zeroes = 0;
for (int32_t i = Bits - 1; i >= 0; i--)
{
if ((value & (1ULL << i)) == 0)
++num_zeroes;
else
break;
}
return num_zeroes;
}
template <class T>
T carry_shift(T value, uint8_t shift)
{
if (shift > 0)
{
value >>= shift - 1;
if (value & 0x1)
{
if (value < 0)
value = (value >> 1) - 1;
else
value = (value >> 1) + 1;
}
else
{
value >>= 1;
}
}
return value;
}
inline int32_t mul_and_carry_shift(int32_t value, int32_t mul, uint8_t shift)
{
return (int32_t)carry_shift((int64_t) value * mul, shift);
}
}
}
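Note (illustration, not part of this commit): carry_shift is a rounding right shift: it shifts by shift-1, inspects the last bit about to be discarded, and bumps the result when that bit is set. mul_and_carry_shift is the fixed-point requantization step used by the quantized kernels above. Worked numbers (positive inputs):

    using nncase::runtime::carry_shift;
    using nncase::runtime::mul_and_carry_shift;

    int a = carry_shift(5, 1);                // 5/2 = 2.5, rounds to 3
    int b = carry_shift(6, 2);                // 6/4 = 1.5, rounds to 2
    int c = mul_and_carry_shift(100, 3, 2);   // (100*3) >> 2 = exactly 75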

@@ -0,0 +1,193 @@
#pragma once
#include "../node_body.h"
namespace nncase
{
namespace targets
{
namespace cpu
{
struct cpu_conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t out_channels;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
value_range<float> fused_activation;
xtl::span<const float> weights;
xtl::span<const float> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(out_channels);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(fused_activation);
reader.read_span(weights, (size_t)out_channels * in_shape[3] * filter_h * filter_w);
reader.read_span(bias, out_channels);
}
};
struct cpu_depthwise_conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
value_range<float> fused_activation;
xtl::span<const float> weights;
xtl::span<const float> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(fused_activation);
reader.read_span(weights, (size_t)in_shape[3] * filter_h * filter_w);
reader.read_span(bias, in_shape[3]);
}
};
struct cpu_reduce_window2d_options : simple_node_body<cpu_reduce_window2d_options>
{
memory_range input;
memory_range output;
reduce_op_t reduce_op;
runtime_shape_t in_shape;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
float init_value;
value_range<float> fused_activation;
};
struct cpu_quantized_conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t out_channels;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
int32_t input_offset;
int32_t filter_offset;
int32_t output_mul;
int32_t output_shift;
int32_t output_offset;
xtl::span<const uint8_t> weights;
xtl::span<const int32_t> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(out_channels);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(input_offset);
reader.read(filter_offset);
reader.read(output_mul);
reader.read(output_shift);
reader.read(output_offset);
reader.read_span(weights, (size_t)out_channels * in_shape[3] * filter_h * filter_w);
reader.read_span(bias, out_channels);
}
};
struct cpu_quantized_depthwise_conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
int32_t input_offset;
int32_t filter_offset;
int32_t output_mul;
int32_t output_shift;
int32_t output_offset;
xtl::span<const uint8_t> weights;
xtl::span<const int32_t> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(input_offset);
reader.read(filter_offset);
reader.read(output_mul);
reader.read(output_shift);
reader.read(output_offset);
reader.read_span(weights, (size_t)in_shape[3] * filter_h * filter_w);
reader.read_span(bias, in_shape[3]);
}
};
}
}
}

@@ -0,0 +1,17 @@
#pragma once
#include <runtime/interpreter.h>
namespace nncase
{
namespace targets
{
namespace cpu
{
class interpreter : public runtime::interpreter_base
{
public:
using interpreter_base::interpreter_base;
};
}
}
}

@@ -0,0 +1,44 @@
#pragma once
#include "k210_sim_types.h"
#include <runtime/interpreter.h>
namespace nncase
{
namespace targets
{
namespace k210
{
struct k210_interpreter_context
{
runtime::interpreter_base *interpreter;
runtime::interpreter_step_t step;
};
class interpreter : public runtime::interpreter_base
{
public:
using interpreter_base::memory_at;
interpreter();
#if !NNCASE_TARGET_K210_SIMULATOR
dmac_channel_number_t dma_ch() const noexcept { return dma_ch_; }
void dma_ch(dmac_channel_number_t dma_ch) noexcept { dma_ch_ = dma_ch; }
k210_interpreter_context &context() noexcept { return context_; }
#endif
protected:
xtl::span<uint8_t> memory_at(const memory_range &range) const noexcept override;
private:
#if NNCASE_TARGET_K210_SIMULATOR
std::unique_ptr<uint8_t[]> kpu_mem_;
#else
dmac_channel_number_t dma_ch_;
k210_interpreter_context context_;
#endif
};
}
}
}

@@ -0,0 +1,58 @@
#pragma once
#include "../node_body.h"
#include "k210_runtime_op_utility.h"
#include "k210_sim_types.h"
namespace nncase
{
namespace targets
{
namespace k210
{
struct kpu_upload_options : simple_node_body<kpu_upload_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
};
struct kpu_conv2d_options
{
memory_range main_mem_output;
int32_t batches;
int32_t reserved0;
kpu_layer_argument_t layer;
xtl::span<const kpu_batchnorm_argument_t> batch_norm;
const kpu_activate_table_t *activation;
xtl::span<const uint8_t> weights;
void deserialize(runtime::span_reader &reader)
{
reader.read(main_mem_output);
reader.read(batches);
reader.read(reserved0);
reader.read(layer);
auto ic = layer.image_channel_num.data.i_ch_num + 1;
auto oc = layer.image_channel_num.data.o_ch_num + 1;
auto filter = get_kpu_filter_size((kpu_filter_type_t)layer.kernel_pool_type_cfg.data.kernel_type);
auto weights_size = layer.interrupt_enabe.data.depth_wise_layer
? oc * filter * filter
: ic * oc * filter * filter;
reader.skip(layer.kernel_pool_type_cfg.data.bwsx_base_addr);
reader.read_span(batch_norm, oc);
reader.skip(layer.kernel_calc_type_cfg.data.active_addr);
reader.get_ref(activation);
reader.skip(layer.kernel_load_cfg.data.para_start_addr);
reader.read_span(weights, weights_size);
#if !NNCASE_TARGET_K210_SIMULATOR
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data();
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation;
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data();
#endif
}
};
}
}
}

@@ -0,0 +1,134 @@
#pragma once
#include "k210_sim_types.h"
namespace nncase
{
namespace targets
{
namespace k210
{
struct kpu_layout
{
int32_t groups;
int32_t row_len;
int32_t row_pitch;
};
inline kpu_layout get_kpu_row_layout(int32_t width)
{
kpu_layout layout;
if (width <= 16)
{
layout.groups = 4;
layout.row_len = 1;
layout.row_pitch = 16;
}
else if (width <= 32)
{
layout.groups = 2;
layout.row_len = 1;
layout.row_pitch = 32;
}
else
{
layout.groups = 1;
layout.row_len = (width + 63) / 64;
layout.row_pitch = 64;
}
return layout;
}
inline int32_t get_kpu_filter_size(kpu_filter_type_t filter)
{
switch (filter)
{
case kpu_filter_1x1:
return 1;
case kpu_filter_3x3:
return 3;
default:
return 0;
}
}
inline int get_kpu_rows(int32_t width, int32_t height, int32_t channels)
{
auto layout = get_kpu_row_layout(width);
auto one_line_channels = std::min(channels, layout.groups);
auto blocks = (channels + one_line_channels - 1) / one_line_channels;
auto size = layout.row_len * height * blocks;
return size;
}
inline int get_kpu_bytes(int32_t width, int32_t height, int32_t channels)
{
return get_kpu_rows(width, height, channels) * 64;
}
#if NNCASE_TARGET_K210_SIMULATOR
inline int32_t get_kpu_filter_size(kpu_pool_type_t filter)
{
switch (filter)
{
case kpu_pool_bypass:
return 1;
case kpu_pool_max_2_s2:
case kpu_pool_mean_2_s2:
case kpu_pool_left_top_2_s2:
case kpu_pool_right_top_2_s2:
case kpu_pool_max_2_s1:
case kpu_pool_mean_2_s1:
return 2;
case kpu_pool_max_4_s4:
case kpu_pool_mean_4_s4:
case kpu_pool_left_top_4_s4:
return 4;
}
}
inline int32_t get_kpu_filter_stride(kpu_pool_type_t filter)
{
switch (filter)
{
case kpu_pool_bypass:
return 1;
case kpu_pool_max_2_s2:
case kpu_pool_mean_2_s2:
case kpu_pool_left_top_2_s2:
case kpu_pool_right_top_2_s2:
return 2;
case kpu_pool_max_2_s1:
case kpu_pool_mean_2_s1:
return 1;
case kpu_pool_max_4_s4:
case kpu_pool_mean_4_s4:
case kpu_pool_left_top_4_s4:
return 4;
}
}
inline int32_t get_kpu_pool_output_size(int32_t input, kpu_pool_type_t pool_type)
{
return input / get_kpu_filter_stride(pool_type);
}
inline std::array<int32_t, 2> get_kpu_select_pool_offset(kpu_pool_type_t pool_type)
{
switch (pool_type)
{
case kpu_pool_left_top_2_s2:
return { 0, 0 };
case kpu_pool_right_top_2_s2:
return { 0, 1 };
case kpu_pool_left_top_4_s4:
return { 0, 0 };
}
}
#endif
}
}
}
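Note (illustration, not part of this commit): KPU RAM stores feature maps in 64-byte rows, packing several narrow channels into one row; get_kpu_row_layout chooses the packing from the row width. Worked numbers as a sketch:

    using namespace nncase::targets::k210;

    auto l = get_kpu_row_layout(20);  // width 17..32: groups=2, row_len=1, row_pitch=32
    // For a 20(W) x 15(H) x 8(C) map: min(8, groups)=2 channels share a row group,
    // ceil(8/2)=4 blocks, so 1 row_len * 15 rows * 4 = 60 rows of 64 bytes each:
    int bytes = get_kpu_bytes(20, 15, 8);  // 60 * 64 = 3840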

@@ -0,0 +1,249 @@
#pragma once
#include <array>
#include <cstdint>
#ifdef __riscv64
#define NNCASE_TARGET_K210_SIMULATOR 0
#include <kpu.h>
#else
#define NNCASE_TARGET_K210_SIMULATOR 1
#endif
namespace nncase
{
namespace targets
{
namespace k210
{
#if NNCASE_TARGET_K210_SIMULATOR
typedef struct
{
union {
uint64_t reg;
struct
{
uint64_t int_en : 1;
uint64_t ram_flag : 1;
uint64_t full_add : 1;
uint64_t depth_wise_layer : 1;
uint64_t reserved : 60;
} data;
} interrupt_enabe;
union {
uint64_t reg;
struct
{
uint64_t image_src_addr : 15;
uint64_t reserved0 : 17;
uint64_t image_dst_addr : 15;
uint64_t reserved1 : 17;
} data;
} image_addr;
union {
uint64_t reg;
struct
{
uint64_t i_ch_num : 10;
uint64_t reserved0 : 22;
uint64_t o_ch_num : 10;
uint64_t reserved1 : 6;
uint64_t o_ch_num_coef : 10;
uint64_t reserved2 : 6;
} data;
} image_channel_num;
union {
uint64_t reg;
struct
{
uint64_t i_row_wid : 10;
uint64_t i_col_high : 9;
uint64_t reserved0 : 13;
uint64_t o_row_wid : 10;
uint64_t o_col_high : 9;
uint64_t reserved1 : 13;
} data;
} image_size;
union {
uint64_t reg;
struct
{
uint64_t kernel_type : 3;
uint64_t pad_type : 1;
uint64_t pool_type : 4;
uint64_t first_stride : 1;
uint64_t bypass_conv : 1;
uint64_t load_para : 1;
uint64_t reserved0 : 5;
uint64_t dma_burst_size : 8;
uint64_t pad_value : 8;
uint64_t bwsx_base_addr : 32;
} data;
} kernel_pool_type_cfg;
union {
uint64_t reg;
struct
{
uint64_t load_coor : 1;
uint64_t load_time : 6;
uint64_t reserved0 : 8;
uint64_t para_size : 17;
uint64_t para_start_addr : 32;
} data;
} kernel_load_cfg;
union {
uint64_t reg;
struct
{
uint64_t coef_column_offset : 4;
uint64_t coef_row_offset : 12;
uint64_t reserved0 : 48;
} data;
} kernel_offset;
union {
uint64_t reg;
struct
{
uint64_t channel_switch_addr : 15;
uint64_t reserved : 1;
uint64_t row_switch_addr : 4;
uint64_t coef_size : 8;
uint64_t coef_group : 3;
uint64_t load_act : 1;
uint64_t active_addr : 32;
} data;
} kernel_calc_type_cfg;
union {
uint64_t reg;
struct
{
uint64_t wb_channel_switch_addr : 15;
uint64_t reserved0 : 1;
uint64_t wb_row_switch_addr : 4;
uint64_t wb_group : 3;
uint64_t reserved1 : 41;
} data;
} write_back_cfg;
union {
uint64_t reg;
struct
{
uint64_t shr_w : 4;
uint64_t shr_x : 4;
uint64_t arg_w : 24;
uint64_t arg_x : 24;
uint64_t reserved0 : 8;
} data;
} conv_value;
union {
uint64_t reg;
struct
{
uint64_t arg_add : 40;
uint64_t reserved : 24;
} data;
} conv_value2;
union {
uint64_t reg;
struct
{
uint64_t send_data_out : 1;
uint64_t reserved : 15;
uint64_t channel_byte_num : 16;
uint64_t dma_total_byte : 32;
} data;
} dma_parameter;
} kpu_layer_argument_t;
typedef struct
{
union {
uint64_t reg;
struct
{
uint64_t shift_number : 8;
uint64_t y_mul : 16;
uint64_t x_start : 36;
} data;
} activate_para[16];
union {
uint64_t reg;
struct
{
uint8_t result_bias[8];
} data;
} activate_para_bias0;
union {
uint64_t reg;
struct
{
uint8_t result_bias[8];
} data;
} activate_para_bias1;
} kpu_activate_table_t;
#endif
typedef struct
{
union {
uint64_t reg;
struct
{
uint64_t norm_mul : 24;
uint64_t norm_add : 32;
uint64_t norm_shift : 4;
} data;
} batchnorm;
} kpu_batchnorm_argument_t;
typedef enum _kpu_filter_type
{
kpu_filter_1x1 = 0,
kpu_filter_3x3 = 1
} kpu_filter_type_t;
typedef enum _kpu_pool_type
{
kpu_pool_bypass = 0,
kpu_pool_max_2_s2 = 1,
kpu_pool_mean_2_s2 = 2,
kpu_pool_max_4_s4 = 3,
kpu_pool_mean_4_s4 = 4,
kpu_pool_left_top_2_s2 = 5,
kpu_pool_right_top_2_s2 = 6,
kpu_pool_left_top_4_s4 = 7,
kpu_pool_mean_2_s1 = 8,
kpu_pool_max_2_s1 = 9
} kpu_pool_type_t;
struct kpu_batchnorm_segment
{
int32_t mul;
int32_t shift;
int32_t add;
};
struct kpu_activation_segment
{
int64_t start_x;
int32_t mul;
int32_t shift;
int32_t add;
};
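// kpu_activation_segment is the unpacked form of one hardware activation
// segment: the KPU activation unit is a 16-entry piecewise-linear table,
// and per segment the output is roughly ((x - start_x) * mul) >> shift
// plus the segment's bias (add).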
using kpu_activation_table_t = std::array<kpu_activation_segment, 16>;
}
}
}


@ -0,0 +1,258 @@
#pragma once
#include "../node_body.h"
namespace nncase
{
namespace targets
{
namespace neutral
{
struct binary_options : public simple_node_body<binary_options>
{
memory_range input_a;
memory_range input_b;
memory_range output;
binary_op_t binary_op;
runtime_shape_t in_a_shape;
runtime_shape_t in_b_shape;
runtime_shape_t out_shape;
value_range<float> fused_activation;
};
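// Concat along an arbitrary axis is lowered to this flattened form: each of
// the outer_size iterations copies a dims[i] * inner_size chunk from every
// input in order, so the kernel never needs the original axis or rank.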
struct concat_options
{
memory_range output;
uint32_t inner_size;
uint32_t outer_size;
uint32_t inputs_count;
xtl::span<const memory_range> inputs;
xtl::span<const int32_t> dims;
void deserialize(runtime::span_reader &reader)
{
reader.read(output);
reader.read(inner_size);
reader.read(outer_size);
reader.read(inputs_count);
reader.read_span(inputs, inputs_count);
reader.read_span(dims, inputs_count);
}
void serialize(runtime::binary_writer &writer) const
{
writer.write(output);
writer.write(inner_size);
writer.write(outer_size);
writer.write(inputs_count);
writer.write_array(inputs);
writer.write_array(dims);
}
};
struct conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t groups;
int32_t out_channels;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
value_range<float> fused_activation;
xtl::span<const float> weights;
xtl::span<const float> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(groups);
reader.read(out_channels);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(fused_activation);
reader.read_span(weights, (size_t)out_channels * in_shape[1] / groups * filter_h * filter_w);
reader.read_span(bias, out_channels);
}
void serialize(runtime::binary_writer &writer) const
{
writer.write(input);
writer.write(output);
writer.write(in_shape);
writer.write(groups);
writer.write(out_channels);
writer.write(padding_h);
writer.write(padding_w);
writer.write(filter_h);
writer.write(filter_w);
writer.write(stride_h);
writer.write(stride_w);
writer.write(dilation_h);
writer.write(dilation_w);
writer.write(fused_activation);
writer.write_array(weights);
writer.write_array(bias);
}
};
struct dequantize_options : public simple_node_body<dequantize_options>
{
memory_range input;
memory_range output;
quant_param_t quant_param;
};
struct matmul_options
{
memory_range input_a;
memory_range input_b;
memory_range output;
int32_t a_rows;
int32_t a_cols;
int32_t b_cols;
value_range<float> fused_activation;
xtl::span<const float> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input_a);
reader.read(input_b);
reader.read(output);
reader.read(a_rows);
reader.read(a_cols);
reader.read(b_cols);
reader.read(fused_activation);
reader.read_span(bias, b_cols);
}
void serialize(runtime::binary_writer &writer) const
{
writer.write(input_a);
writer.write(input_b);
writer.write(output);
writer.write(a_rows);
writer.write(a_cols);
writer.write(b_cols);
writer.write(fused_activation);
writer.write_array(bias);
}
};
struct memory_copy_options : public simple_node_body<memory_copy_options>
{
memory_range input;
memory_range output;
};
struct pad_options : public simple_node_body<pad_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
runtime_paddings_t paddings;
scalar pad_value;
};
struct quantize_options : public simple_node_body<quantize_options>
{
memory_range input;
memory_range output;
quant_param_t quant_param;
};
struct reduce_options : public simple_node_body<reduce_options>
{
memory_range input;
memory_range output;
reduce_op_t reduce_op;
runtime_shape_t in_shape;
runtime_shape_t out_shape;
float init_value;
};
struct reduce_window2d_options : simple_node_body<reduce_window2d_options>
{
memory_range input;
memory_range output;
reduce_op_t reduce_op;
runtime_shape_t in_shape;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
float init_value;
value_range<float> fused_activation;
};
struct resize_bilinear_options : public simple_node_body<resize_bilinear_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t out_h;
int32_t out_w;
bool align_corners;
};
struct resize_nearest_neighbor_options : public simple_node_body<resize_nearest_neighbor_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t out_h;
int32_t out_w;
bool align_corners;
};
struct softmax_options : public simple_node_body<softmax_options>
{
memory_range input;
memory_range output;
int32_t inner_size;
int32_t outer_size;
float beta;
};
struct transpose_options : public simple_node_body<transpose_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
runtime_shape_t perm;
};
struct strided_slice_options : public simple_node_body<strided_slice_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
runtime_shape_t begin;
runtime_shape_t end;
runtime_shape_t strides;
int32_t begin_mask;
int32_t end_mask;
int32_t ellipsis_mask;
int32_t new_axis_mask;
int32_t shrink_axis_mask;
};
}
}
}


@ -0,0 +1,24 @@
#pragma once
#include "../runtime/binary_writer.h"
#include "../runtime/span_reader.h"
#include <datatypes.h>
namespace nncase
{
namespace targets
{
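// CRTP helper for options structs that are plain trivially-copyable data:
// (de)serialization is a single whole-struct read/write. Only ops carrying
// variable-length spans (concat, conv2d, matmul, ...) hand-roll the methods.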
template <class T>
struct simple_node_body
{
void deserialize(runtime::span_reader &reader)
{
reader.read(static_cast<T &>(*this));
}
void serialize(runtime::binary_writer &writer) const
{
writer.write(static_cast<const T &>(*this));
}
};
}
}

116
lib/nncase/nncase.cpp Normal file

@ -0,0 +1,116 @@
/* Copyright 2018 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nncase.h>
#include <runtime/target_config.h>
#include <stdio.h>
using namespace nncase;
using namespace nncase::runtime;
class nncase_context
{
public:
int load_kmodel(const uint8_t *buffer)
{
return interpreter_.try_load_model(buffer) ? 0 : -1;
}
int get_output(uint32_t index, uint8_t **data, size_t *size)
{
if (index >= interpreter_.outputs_size())
return -1;
auto mem = interpreter_.memory_at<uint8_t>(interpreter_.output_at(index));
*data = mem.data();
*size = mem.size();
return 0;
}
int run_kmodel(const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
done_callback_ = done_callback;
userdata_ = userdata;
interpreter_.dma_ch(dma_ch);
auto input = interpreter_.input_at(0);
auto mem = interpreter_.memory_at<uint8_t>(input);
std::copy(src, src + mem.size(), mem.begin());
interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
return 0;
}
private:
void on_done()
{
printf("Total: %fms\n", interpreter_.total_duration().count() / 1e6);
if (done_callback_)
done_callback_(userdata_);
}
static void done_thunk(void *userdata)
{
reinterpret_cast<nncase_context *>(userdata)->on_done();
}
static void on_error_thunk(const char *err, void *userdata)
{
printf("Fatal: %s\n", err);
}
static void node_profile_thunk(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata)
{
printf("%s: %fms\n", node_opcode_names(op).data(), duration.count() / 1e6);
}
private:
interpreter_t interpreter_;
kpu_done_callback_t done_callback_;
void *userdata_;
};
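// These four shims are what the kpu_* C API dispatches to once is_nncase is
// set. A minimal caller sketch (model_data, input and on_done are
// hypothetical names; the kpu_* entry points are the SDK's public ones):
//
//   kpu_model_context_t task;
//   kpu_load_kmodel(&task, model_data);  // routes here for V4 models
//   kpu_run_kmodel(&task, input, DMAC_CHANNEL5, on_done, NULL);
//   // ...after on_done fires:
//   uint8_t *out; size_t size;
//   kpu_get_output(&task, 0, &out, &size);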
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
auto nnctx = new (std::nothrow) nncase_context();
if (nnctx)
{
ctx->is_nncase = 1;
ctx->nncase_ctx = nnctx;
return nnctx->load_kmodel(buffer);
}
else
{
return -1;
}
}
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
return nnctx->get_output(index, data, size);
}
void nncase_model_free(kpu_model_context_t *ctx)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
delete nnctx;
ctx->nncase_ctx = nullptr;
}
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
return nnctx->run_kmodel(src, dma_ch, done_callback, userdata);
}


@ -0,0 +1,131 @@
#include <cassert>
#include <iostream>
#include <runtime/interpreter.h>
#include <runtime/kernel_registry.h>
using namespace nncase;
using namespace nncase::runtime;
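// kmodel V4 layout, in file order: model_header, input memory_ranges, input
// runtime_shapes, output memory_ranges, the constants blob, node_headers,
// and finally the concatenated node bodies that step() walks sequentially.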
bool interpreter_base::try_load_model(const uint8_t *buffer)
{
auto offset = buffer;
model_header_ = reinterpret_cast<const model_header *>(buffer);
// Validate model
if (model_header_->identifier != MODEL_IDENTIFIER || model_header_->version != MODEL_VERSION || (model_header_->target != MODEL_TARGET_CPU && model_header_->target != MODEL_TARGET_K210))
return false;
// Allocate buffers
main_mem_.reset(new (std::nothrow) uint8_t[model_header_->main_mem]);
if (!main_mem_)
return false;
offset += sizeof(model_header);
inputs_ = { reinterpret_cast<const memory_range *>(offset), inputs_size() };
offset += sizeof(memory_range) * inputs_size();
input_shapes_ = { reinterpret_cast<const runtime_shape_t *>(offset), inputs_size() };
offset += sizeof(runtime_shape_t) * inputs_size();
outputs_ = { reinterpret_cast<const memory_range *>(offset), outputs_size() };
offset += sizeof(memory_range) * outputs_size();
constants_ = { offset, model_header_->constants };
offset += constants_.size();
node_headers_ = { reinterpret_cast<const node_header *>(offset), nodes_size() };
offset += sizeof(node_header) * nodes_size();
node_body_start_ = offset;
return initialize();
}
bool interpreter_base::initialize()
{
return true;
}
void interpreter_base::run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata)
{
run_callback_ = callback;
on_error_ = on_error;
node_profile_ = node_profile;
userdata_ = userdata;
cnt_node_ = 0;
cnt_node_body_ = node_body_start_;
total_duration_ = {};
last_time_.reset();
step();
}
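// Control flow note: step() loops only while kernels finish synchronously
// (kcr_done). An asynchronous kernel (e.g. a hardware KPU conv2d) returns
// kcr_async to leave the loop, and its completion interrupt re-enters step()
// to resume the node list; the profiling timestamps therefore also cover the
// asynchronous wait.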
void interpreter_base::step()
{
auto result = kcr_done;
while (result == kcr_done)
{
if (!last_time_)
{
last_time_ = clock_t::now();
}
else
{
auto now = clock_t::now();
auto duration = now - *last_time_;
total_duration_ += duration;
last_time_ = now;
if (node_profile_)
node_profile_(last_op_, duration, userdata_);
}
if (cnt_node_ == nodes_size())
{
run_callback_(userdata_);
break;
}
else
{
auto node_id = cnt_node_++;
auto header = node_headers_[node_id];
xtl::span<const uint8_t> body(cnt_node_body_, header.body_size);
cnt_node_body_ += header.body_size;
last_op_ = header.opcode;
result = call_kernel(header.opcode, body, static_cast<interpreter_t &>(*this), &interpreter_base::step);
if (result == kcr_error)
{
if (on_error_)
{
char buffer[256];
auto name = node_opcode_names(header.opcode);
if (!name.empty())
std::sprintf(buffer, "error occurs in running kernel: %s", name.data());
else
std::sprintf(buffer, "Unknown opcode: (%d)", header.opcode);
on_error_(buffer, userdata_);
}
break;
}
}
}
}
xtl::span<uint8_t> interpreter_base::memory_at(const memory_range &range) const noexcept
{
uintptr_t base;
switch (range.memory_type)
{
case mem_const:
base = (uintptr_t)constants_.data();
break;
case mem_main:
base = (uintptr_t)main_mem_.get();
break;
default:
base = 0;
assert(!"Invalid memory type");
break;
}
return { reinterpret_cast<uint8_t *>(base + range.start), range.size };
}


@ -0,0 +1,55 @@
#include <runtime/kernel_registry.h>
#include <runtime/span_reader.h>
#include <targets/cpu/cpu_ops_body.h>
#include <targets/k210/k210_ops_body.h>
#include <targets/neutral/neutral_ops_body.h>
using namespace nncase;
using namespace nncase::runtime;
namespace nncase
{
namespace targets
{
#define BEGINE_DEFINE_TARGET(target) \
namespace target \
{
#define DEFINE_RUNTIME_OP(target, id, name, value) \
kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t);
#define END_DEFINE_TARGET() }
#include <runtime/runtime_op.def>
#undef BEGINE_DEFINE_TARGET
#undef DEFINE_RUNTIME_OP
#undef END_DEFINE_TARGET
}
}
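// runtime_op.def is an X-macro list included twice with different macro
// definitions: once above to declare each kernel entry point, and once below
// to generate a deserialize-and-dispatch case per opcode, so adding an op to
// the .def file wires up both sides automatically.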
kernel_call_result runtime::call_kernel(runtime_opcode opcode, xtl::span<const uint8_t> body, interpreter_t &interpreter, interpreter_step_t step)
{
span_reader reader(body);
switch (opcode)
{
#define BEGINE_DEFINE_TARGET(...)
#define DEFINE_RUNTIME_OP(target, id, name, value) \
case rop_##id: \
{ \
nncase::targets::target::id##_options options; \
options.deserialize(reader); \
return nncase::targets::target::id(options, interpreter, step); \
}
#define END_DEFINE_TARGET()
#include <runtime/runtime_op.def>
#undef BEGINE_DEFINE_TARGET
#undef DEFINE_RUNTIME_OP
#undef END_DEFINE_TARGET
default:
return kcr_error;
}
}


@ -0,0 +1,79 @@
#include <kernels/cpu/cpu_kernels.h>
#include <runtime/kernel_registry.h>
#include <targets/cpu/cpu_ops_body.h>
using namespace nncase;
using namespace nncase::runtime;
namespace nncase
{
namespace targets
{
namespace cpu
{
kernel_call_result cpu_conv2d(cpu_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::cpu::conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.out_channels, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation);
return kcr_done;
}
kernel_call_result cpu_depthwise_conv2d(cpu_depthwise_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::cpu::depthwise_conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation);
return kcr_done;
}
runtime::kernel_call_result cpu_reduce_window2d(cpu_reduce_window2d_options &options, interpreter_t &interpreter, runtime::interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
auto reduce = [&](auto binary_op, auto window_op) {
kernels::cpu::reduce_window2d(input.data(), output.data(), options.init_value, options.in_shape, options.filter_h, options.filter_w, options.stride_h,
options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation, binary_op, window_op);
};
switch (options.reduce_op)
{
case reduce_mean:
reduce([](auto a, auto b) { return a + b; }, [](auto v, auto k) { return v / k; });
return runtime::kcr_done;
case reduce_min:
reduce([](auto a, auto b) { return std::min(a, b); }, [](auto v, auto k) { return v; });
return runtime::kcr_done;
case reduce_max:
reduce([](auto a, auto b) { return std::max(a, b); }, [](auto v, auto k) { return v; });
return kcr_done;
default:
return kcr_error;
}
}
kernel_call_result cpu_quantized_conv2d(cpu_quantized_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::cpu::quantized_conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.out_channels, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w,
options.input_offset, options.filter_offset, options.output_mul, options.output_shift, options.output_offset);
return kcr_done;
}
kernel_call_result cpu_quantized_depthwise_conv2d(cpu_quantized_depthwise_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::cpu::quantized_depthwise_conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w,
options.input_offset, options.filter_offset, options.output_mul, options.output_shift, options.output_offset);
return kcr_done;
}
}
}
}


@ -0,0 +1,36 @@
#include <targets/k210/interpreter.h>
using namespace nncase;
using namespace nncase::runtime;
using namespace nncase::targets::k210;
interpreter::interpreter()
#if NNCASE_TARGET_K210_SIMULATOR
: kpu_mem_(std::make_unique<uint8_t[]>(2 * 1024 * 1024))
#endif
{
#if !NNCASE_TARGET_K210_SIMULATOR
kpu->interrupt_clear.reg = 7;
kpu->interrupt_mask.reg = 7;
kpu->fifo_threshold.reg = 10 | (1 << 4);
kpu->eight_bit_mode.reg = 1;
plic_set_priority(IRQN_AI_INTERRUPT, 1);
#endif
}
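// mem_k210_kpu ranges resolve to the 2 MB KPU RAM: a heap-allocated mirror
// in the simulator, or the AI_IO_BASE_ADDR aperture on real silicon. All
// other memory types fall through to the base interpreter.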
xtl::span<uint8_t> interpreter::memory_at(const memory_range &range) const noexcept
{
if (range.memory_type == mem_k210_kpu)
{
uintptr_t base =
#if NNCASE_TARGET_K210_SIMULATOR
(uintptr_t)kpu_mem_.get();
#else
(uintptr_t)AI_IO_BASE_ADDR;
#endif
return { reinterpret_cast<uint8_t *>(base + range.start), range.size };
}
return interpreter_base::memory_at(range);
}


@ -0,0 +1,179 @@
#include <kernels/k210/k210_kernels.h>
#include <runtime/kernel_registry.h>
#include <targets/k210/k210_ops_body.h>
#if !NNCASE_TARGET_K210_SIMULATOR
#include <dmac.h>
#include <sysctl.h>
#endif
using namespace nncase;
using namespace nncase::runtime;
using namespace nncase::targets::k210;
namespace
{
#if !NNCASE_TARGET_K210_SIMULATOR
void kpu_send_layer(const kpu_layer_argument_t &layer)
{
kpu->layer_argument_fifo = layer.interrupt_enabe.reg;
kpu->layer_argument_fifo = layer.image_addr.reg;
kpu->layer_argument_fifo = layer.image_channel_num.reg;
kpu->layer_argument_fifo = layer.image_size.reg;
kpu->layer_argument_fifo = layer.kernel_pool_type_cfg.reg;
kpu->layer_argument_fifo = layer.kernel_load_cfg.reg;
kpu->layer_argument_fifo = layer.kernel_offset.reg;
kpu->layer_argument_fifo = layer.kernel_calc_type_cfg.reg;
kpu->layer_argument_fifo = layer.write_back_cfg.reg;
kpu->layer_argument_fifo = layer.conv_value.reg;
kpu->layer_argument_fifo = layer.conv_value2.reg;
kpu->layer_argument_fifo = layer.dma_parameter.reg;
}
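// Two hardware submission paths follow: kpu_conv2d_normal leaves the result
// in KPU RAM and signals completion via the AI interrupt, while
// kpu_conv2d_output additionally drains the layer's output FIFO to main
// memory over DMA and uses the DMA-complete interrupt as the done signal.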
void kpu_conv2d_normal(kpu_layer_argument_t &layer, plic_irq_callback_t callback, void *userdata)
{
kpu->interrupt_clear.reg = 0b111;
kpu->interrupt_mask.reg = 0b110;
layer.interrupt_enabe.data.int_en = 1;
plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata);
plic_irq_enable(IRQN_AI_INTERRUPT);
kpu_send_layer(layer);
}
void kpu_conv2d_output(kpu_layer_argument_t &layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
{
kpu->interrupt_clear.reg = 0b111;
kpu->interrupt_mask.reg = 0b111;
layer.dma_parameter.data.send_data_out = 1;
sysctl_dma_select((sysctl_dma_channel_t)dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
dmac_set_irq(dma_ch, callback, userdata, 1);
dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
kpu_send_layer(layer);
}
int kpu_plic_thunk(void *userdata)
{
kpu->interrupt_clear.reg = 0b111;
kpu->interrupt_mask.reg = 0b111;
auto &ctx = *reinterpret_cast<k210_interpreter_context *>(userdata);
(ctx.interpreter->*ctx.step)();
return 0;
}
#endif
}
namespace nncase
{
namespace targets
{
namespace k210
{
kernel_call_result kpu_upload(kpu_upload_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::k210::kpu_upload(input.data(), output.data(), options.in_shape);
return kcr_done;
}
kernel_call_result kpu_conv2d(kpu_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
#if NNCASE_TARGET_K210_SIMULATOR
auto input = interpreter.memory_at<uint8_t>({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_src_addr * 64, 1 });
auto kpu_out = interpreter.memory_at<uint8_t>({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_dst_addr * 64, 1 });
auto in_h = static_cast<int32_t>(options.layer.image_size.data.i_col_high + 1);
auto in_w = static_cast<int32_t>(options.layer.image_size.data.i_row_wid + 1);
auto in_ch = static_cast<int32_t>(options.layer.image_channel_num.data.i_ch_num + 1);
runtime_shape_t in_shape { options.batches, in_ch, in_h, in_w };
auto in_fmap_size = kernels::details::compute_size(in_shape);
auto out_h = static_cast<int32_t>(options.layer.image_size.data.o_col_high + 1);
auto out_w = static_cast<int32_t>(options.layer.image_size.data.o_row_wid + 1);
auto out_ch = static_cast<int32_t>(options.layer.image_channel_num.data.o_ch_num + 1);
runtime_shape_t conv_out_shape { options.batches, out_ch, in_h, in_w };
auto conv_out_fmap_size = kernels::details::compute_size(conv_out_shape);
runtime_shape_t out_shape { options.batches, out_ch, out_h, out_w };
auto out_fmap_size = kernels::details::compute_size(out_shape);
auto input_tmp = std::make_unique<uint8_t[]>(in_fmap_size);
auto workspace = std::make_unique<int64_t[]>(conv_out_fmap_size);
auto conv_output_tmp = std::make_unique<uint8_t[]>(conv_out_fmap_size);
auto output_tmp = std::make_unique<uint8_t[]>(out_fmap_size);
kernels::k210::kpu_download(input.data(), input_tmp.get(), in_shape);
auto is_depthwise = options.layer.interrupt_enabe.data.depth_wise_layer != 0;
auto filter_size = get_kpu_filter_size((kpu_filter_type_t)options.layer.kernel_pool_type_cfg.data.kernel_type);
auto pad_value = (uint8_t)options.layer.kernel_pool_type_cfg.data.pad_value;
auto arg_x = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_x);
auto shift_x = (int32_t)options.layer.conv_value.data.shr_x;
auto arg_w = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_w);
auto shift_w = (int32_t)options.layer.conv_value.data.shr_w;
auto arg_add = kernels::details::to_signed<40>(options.layer.conv_value2.data.arg_add);
auto batchnorm = std::make_unique<kpu_batchnorm_segment[]>(out_ch);
for (size_t i = 0; i < out_ch; i++)
{
auto &src = options.batch_norm[i].batchnorm.data;
auto &dest = batchnorm[i];
dest.mul = (int32_t)kernels::details::to_signed<24>(src.norm_mul);
dest.shift = (int32_t)src.norm_shift;
dest.add = (int32_t)kernels::details::to_signed<32>(src.norm_add);
}
kpu_activation_table_t activation;
for (size_t i = 0; i < 16; i++)
{
auto &src = options.activation->activate_para[i].data;
auto &dest = activation[i];
dest.start_x = kernels::details::to_signed<36>(src.x_start);
dest.mul = (int32_t)kernels::details::to_signed<16>(src.y_mul);
dest.shift = (int32_t)src.shift_number;
if (i < 16)
dest.add = options.activation->activate_para_bias0.data.result_bias[i];
else
dest.add = options.activation->activate_para_bias1.data.result_bias[i - 16];
}
#define KPU_CONV2D_IMPL(is_depthwise_val, filter_size_val) \
if (is_depthwise == is_depthwise_val && filter_size == filter_size_val) \
kernels::k210::kpu_conv2d<is_depthwise_val, filter_size_val>(input_tmp.get(), workspace.get(), conv_output_tmp.get(), options.weights.data(), \
in_h, in_w, in_ch, out_ch, pad_value, arg_x, shift_x, arg_w, shift_w, arg_add, batchnorm.get(), activation)
KPU_CONV2D_IMPL(true, 1);
else KPU_CONV2D_IMPL(true, 3);
else KPU_CONV2D_IMPL(false, 1);
else KPU_CONV2D_IMPL(false, 3);
kernels::k210::kpu_pool2d(conv_output_tmp.get(), output_tmp.get(), in_h, in_w, out_ch, (kpu_pool_type_t)options.layer.kernel_pool_type_cfg.data.pool_type);
kernels::k210::kpu_upload(output_tmp.get(), kpu_out.data(), out_shape);
if (options.main_mem_output.size)
{
auto main_output = interpreter.memory_at<uint8_t>(options.main_mem_output);
std::copy(output_tmp.get(), output_tmp.get() + out_fmap_size, main_output.data());
}
return kcr_done;
#else
auto &ctx = interpreter.context();
ctx.interpreter = &interpreter;
ctx.step = step;
if (options.main_mem_output.size)
{
auto main_output = interpreter.memory_at<uint8_t>(options.main_mem_output);
kpu_conv2d_output(options.layer, interpreter.dma_ch(), main_output.data(), kpu_plic_thunk, &ctx);
}
else
{
kpu_conv2d_normal(options.layer, kpu_plic_thunk, &ctx);
}
return kcr_async;
#endif
}
}
}
}


@ -0,0 +1,238 @@
#include <kernels/neutral/neutral_kernels.h>
#include <runtime/kernel_registry.h>
#include <targets/neutral/neutral_ops_body.h>
using namespace nncase;
using namespace nncase::runtime;
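// ELEM_SIZE_IMPL dispatches a type-agnostic kernel on element width alone
// (1/2/4 bytes); that is sufficient for the pad/resize-nn/transpose/slice
// style ops below, which only move elements and never interpret their values.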
#define ELEM_SIZE_IMPL(type, KERNEL) \
switch (runtime::get_bytes(type)) \
{ \
case 1: \
KERNEL(uint8_t); \
break; \
case 2: \
KERNEL(uint16_t); \
break; \
case 4: \
KERNEL(uint32_t); \
break; \
default: \
return kcr_error; \
}
namespace nncase
{
namespace targets
{
namespace neutral
{
kernel_call_result binary(binary_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input_a = interpreter.memory_at<float>(options.input_a);
auto input_b = interpreter.memory_at<float>(options.input_b);
auto output = interpreter.memory_at<float>(options.output);
auto binary = [&](auto op) {
kernels::neutral::binary(input_a.data(), input_b.data(), output.data(), options.in_a_shape, options.in_b_shape, options.out_shape, options.fused_activation, op);
};
switch (options.binary_op)
{
case binary_add:
binary([](auto a, auto b) { return a + b; });
return kcr_done;
case binary_sub:
binary([](auto a, auto b) { return a - b; });
return kcr_done;
case binary_mul:
binary([](auto a, auto b) { return a * b; });
return kcr_done;
case binary_div:
binary([](auto a, auto b) { return a / b; });
return kcr_done;
default:
return kcr_error;
}
}
kernel_call_result concat(concat_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::neutral::concat(options.inputs, output.data(), options.dims, options.inner_size, options.outer_size,
[&](const memory_range &range) { return interpreter.memory_at<uint8_t>(range).data(); });
return kcr_done;
}
kernel_call_result conv2d(conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.groups, options.out_channels, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation);
return kcr_done;
}
kernel_call_result dequantize(dequantize_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::dequantize(input.data(), output.data(), input.size(), options.quant_param);
return kcr_done;
}
kernel_call_result matmul(matmul_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input_a = interpreter.memory_at<float>(options.input_a);
auto input_b = interpreter.memory_at<float>(options.input_b);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::matmul(input_a.data(), input_b.data(), output.data(), options.bias.data(), options.a_rows, options.a_cols, options.b_cols, options.fused_activation);
return kcr_done;
}
kernel_call_result memory_copy(memory_copy_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
std::copy(input.begin(), input.end(), output.begin());
return kcr_done;
}
kernel_call_result pad(pad_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
#define PAD_KERNEL(T) \
kernels::neutral::pad(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.paddings, options.pad_value.as<T>());
ELEM_SIZE_IMPL(options.input.datatype, PAD_KERNEL);
return kcr_done;
#undef PAD_KERNEL
}
kernel_call_result quantize(quantize_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::neutral::quantize(input.data(), output.data(), input.size(), options.quant_param);
return runtime::kcr_done;
}
kernel_call_result reduce(reduce_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
auto reduce = [&](auto op) {
kernels::neutral::reduce(input.data(), output.data(), options.init_value, options.in_shape, options.out_shape, op);
};
switch (options.reduce_op)
{
case reduce_mean:
{
reduce([](auto a, auto b) { return a + b; });
auto mul = (float)output.size() / input.size();
kernels::neutral::unary(output.data(), output.data(), output.size(), [mul](auto a) { return a * mul; });
return kcr_done;
}
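// (mean is computed as a full sum followed by a uniform scale: the factor
// output.size() / input.size() equals 1/k when each output aggregates k inputs.)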
case reduce_min:
reduce([](auto a, auto b) { return std::min(a, b); });
return kcr_done;
case reduce_max:
reduce([](auto a, auto b) { return std::max(a, b); });
return kcr_done;
default:
return kcr_error;
}
}
kernel_call_result reduce_window2d(reduce_window2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
auto reduce = [&](auto binary_op, auto window_op) {
kernels::neutral::reduce_window2d(input.data(), output.data(), options.init_value, options.in_shape, options.filter_h, options.filter_w, options.stride_h,
options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation, binary_op, window_op);
};
switch (options.reduce_op)
{
case reduce_mean:
reduce([](auto a, auto b) { return a + b; }, [](auto v, auto k) { return v / k; });
return kcr_done;
case reduce_min:
reduce([](auto a, auto b) { return std::min(a, b); }, [](auto v, auto k) { return v; });
return kcr_done;
case reduce_max:
reduce([](auto a, auto b) { return std::max(a, b); }, [](auto v, auto k) { return v; });
return kcr_done;
default:
return kcr_error;
}
}
kernel_call_result resize_bilinear(resize_bilinear_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::resize_bilinear(input.data(), output.data(), options.in_shape, options.out_h, options.out_w, options.align_corners);
return kcr_done;
}
kernel_call_result resize_nearest_neighbor(resize_nearest_neighbor_options &options, interpreter_t &interpreter, runtime::interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
#define RESIZE_NN_KERNEL(T) \
kernels::neutral::resize_nearest_neighbor(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.out_h, options.out_w);
ELEM_SIZE_IMPL(options.input.datatype, RESIZE_NN_KERNEL);
return kcr_done;
#undef RESIZE_NN_KERNEL
}
kernel_call_result softmax(softmax_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::softmax(input.data(), output.data(), options.beta, options.outer_size, options.inner_size);
return kcr_done;
}
kernel_call_result transpose(transpose_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
#define TRANSPOSE_KERNEL(T) \
kernels::neutral::transpose(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.perm);
ELEM_SIZE_IMPL(options.input.datatype, TRANSPOSE_KERNEL);
return kcr_done;
#undef TRANSPOSE_KERNEL
}
kernel_call_result strided_slice(strided_slice_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
#define STRIDED_SLICE_KERNEL(T) \
kernels::neutral::strided_slice(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.begin, options.end, options.strides);
ELEM_SIZE_IMPL(options.input.datatype, STRIDED_SLICE_KERNEL);
return kcr_done;
#undef STRIDED_SLICE_KERNEL
}
}
}
}

29
third_party/xtl/LICENSE vendored Normal file

@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2017, Sylvain Corlay and Johan Mabille
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

66
third_party/xtl/README.md vendored Normal file

@ -0,0 +1,66 @@
# ![xtl](docs/source/xtl.svg)
[![Travis](https://travis-ci.org/QuantStack/xtl.svg?branch=master)](https://travis-ci.org/QuantStack/xtl)
[![Appveyor](https://ci.appveyor.com/api/projects/status/g9bldap2wirlue9w?svg=true)](https://ci.appveyor.com/project/QuantStack/xtl)
[![Azure](https://dev.azure.com/johanmabille/johanmabille/_apis/build/status/QuantStack.xtl?branchName=master)](https://dev.azure.com/johanmabille/johanmabille/_build/latest?definitionId=1&branchName=master)
[![Documentation Status](http://readthedocs.org/projects/xtl/badge/?version=latest)](https://xtl.readthedocs.io/en/latest/?badge=latest)
[![Join the Gitter Chat](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/QuantStack/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
Basic tools (containers, algorithms) used by other quantstack packages
## Installation
`xtl` is a header-only library. We provide a package for the conda package manager.
```bash
conda install -c conda-forge xtl
```
Or you can directly install it from the sources:
```bash
cmake -DCMAKE_INSTALL_PREFIX=your_install_prefix
make install
```
## Documentation
To get started with using `xtl`, check out the full documentation
http://xtl.readthedocs.io/
## Building the HTML documentation
xtl's documentation is built with three tools
- [doxygen](http://www.doxygen.org)
- [sphinx](http://www.sphinx-doc.org)
- [breathe](https://breathe.readthedocs.io)
While doxygen must be installed separately, you can install breathe by typing
```bash
pip install breathe
```
Breathe can also be installed with `conda`
```bash
conda install -c conda-forge breathe
```
Finally, build the documentation with
```bash
make html
```
from the `docs` subdirectory.
## License
We use a shared copyright model that enables all contributors to maintain the
copyright on their contributions.
This software is licensed under the BSD-3-Clause license. See the [LICENSE](LICENSE) file for details.

20
third_party/xtl/include/xtl/xspan.hpp vendored Normal file

@ -0,0 +1,20 @@
/***************************************************************************
* Copyright (c) 2016, Sylvain Corlay and Johan Mabille *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XTL_XSPAN_HPP
#define XTL_XSPAN_HPP
#include "xspan_impl.hpp"
namespace xtl
{
using tcb::span;
constexpr std::ptrdiff_t dynamic_extent = tcb::dynamic_extent;
}
#endif


@ -0,0 +1,778 @@
// https://github.com/tcbrindle/span/blob/master/include/tcb/span.hpp
// TCB SPAN @commit cd0c6d0
/*
This is an implementation of std::span from P0122R7
http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0122r7.pdf
*/
// Copyright Tristan Brindle 2018.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file ../../LICENSE_1_0.txt or copy at
// https://www.boost.org/LICENSE_1_0.txt)
#ifndef TCB_SPAN_HPP_INCLUDED
#define TCB_SPAN_HPP_INCLUDED
#include <array>
#include <cstddef>
#include <type_traits>
#ifndef TCB_SPAN_NO_EXCEPTIONS
// Attempt to discover whether we're being compiled with exception support
#if !(defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND))
#define TCB_SPAN_NO_EXCEPTIONS
#endif
#endif
#ifndef TCB_SPAN_NO_EXCEPTIONS
#include <cstdio>
#include <stdexcept>
#endif
// Various feature test macros
#ifndef TCB_SPAN_NAMESPACE_NAME
#define TCB_SPAN_NAMESPACE_NAME tcb
#endif
#ifdef TCB_SPAN_STD_COMPLIANT_MODE
#define TCB_SPAN_NO_DEPRECATION_WARNINGS
#endif
#ifndef TCB_SPAN_NO_DEPRECATION_WARNINGS
#define TCB_SPAN_DEPRECATED_FOR(msg) [[deprecated(msg)]]
#else
#define TCB_SPAN_DEPRECATED_FOR(msg)
#endif
#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
#define TCB_SPAN_HAVE_CPP17
#endif
#if __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
#define TCB_SPAN_HAVE_CPP14
#endif
namespace TCB_SPAN_NAMESPACE_NAME {
// Establish default contract checking behavior
#if !defined(TCB_SPAN_THROW_ON_CONTRACT_VIOLATION) && \
!defined(TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION) && \
!defined(TCB_SPAN_NO_CONTRACT_CHECKING)
#if defined(NDEBUG) || !defined(TCB_SPAN_HAVE_CPP14)
#define TCB_SPAN_NO_CONTRACT_CHECKING
#else
#define TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION
#endif
#endif
#if defined(TCB_SPAN_THROW_ON_CONTRACT_VIOLATION)
struct contract_violation_error : std::logic_error {
explicit contract_violation_error(const char* msg) : std::logic_error(msg)
{}
};
inline void contract_violation(const char* msg)
{
throw contract_violation_error(msg);
}
#elif defined(TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION)
[[noreturn]] inline void contract_violation(const char* /*unused*/)
{
std::terminate();
}
#endif
#if !defined(TCB_SPAN_NO_CONTRACT_CHECKING)
#define TCB_SPAN_STRINGIFY(cond) #cond
#define TCB_SPAN_EXPECT(cond) \
cond ? (void) 0 : contract_violation("Expected " TCB_SPAN_STRINGIFY(cond))
#else
#define TCB_SPAN_EXPECT(cond)
#endif
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_inline_variables)
#define TCB_SPAN_INLINE_VAR inline
#else
#define TCB_SPAN_INLINE_VAR
#endif
#if defined(TCB_SPAN_HAVE_CPP14) || \
(defined(__cpp_constexpr) && __cpp_constexpr >= 201304)
#define TCB_SPAN_CONSTEXPR14 constexpr
#else
#define TCB_SPAN_CONSTEXPR14
#endif
#if defined(TCB_SPAN_NO_CONTRACT_CHECKING)
#define TCB_SPAN_CONSTEXPR11 constexpr
#else
#define TCB_SPAN_CONSTEXPR11 TCB_SPAN_CONSTEXPR14
#endif
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_deduction_guides)
#define TCB_SPAN_HAVE_DEDUCTION_GUIDES
#endif
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_byte)
#define TCB_SPAN_HAVE_STD_BYTE
#endif
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_array_constexpr)
#define TCB_SPAN_HAVE_CONSTEXPR_STD_ARRAY_ETC
#endif
#if defined(TCB_SPAN_HAVE_CONSTEXPR_STD_ARRAY_ETC)
#define TCB_SPAN_ARRAY_CONSTEXPR constexpr
#else
#define TCB_SPAN_ARRAY_CONSTEXPR
#endif
#ifdef TCB_SPAN_HAVE_STD_BYTE
using byte = std::byte;
#else
using byte = unsigned char;
#endif
TCB_SPAN_INLINE_VAR constexpr std::ptrdiff_t dynamic_extent = -1;
template <typename ElementType, std::ptrdiff_t Extent = dynamic_extent>
class span;
namespace detail {
template <typename E, std::ptrdiff_t S>
struct span_storage {
constexpr span_storage() noexcept = default;
constexpr span_storage(E* ptr, std::ptrdiff_t /*unused*/) noexcept
: ptr(ptr)
{}
E* ptr = nullptr;
static constexpr std::ptrdiff_t size = S;
};
template <typename E>
struct span_storage<E, dynamic_extent> {
constexpr span_storage() noexcept = default;
constexpr span_storage(E* ptr, std::size_t size) noexcept
: ptr(ptr), size(size)
{}
E* ptr = nullptr;
std::size_t size = 0;
};
// Reimplementation of C++17 std::size() and std::data()
#if defined(TCB_SPAN_HAVE_CPP17) || \
defined(__cpp_lib_nonmember_container_access)
using std::data;
using std::size;
#else
template <class C>
constexpr auto size(const C& c) -> decltype(c.size())
{
return c.size();
}
template <class T, std::size_t N>
constexpr std::size_t size(const T (&)[N]) noexcept
{
return N;
}
template <class C>
constexpr auto data(C& c) -> decltype(c.data())
{
return c.data();
}
template <class C>
constexpr auto data(const C& c) -> decltype(c.data())
{
return c.data();
}
template <class T, std::size_t N>
constexpr T* data(T (&array)[N]) noexcept
{
return array;
}
template <class E>
constexpr const E* data(std::initializer_list<E> il) noexcept
{
return il.begin();
}
#endif // TCB_SPAN_HAVE_CPP17
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_void_t)
using std::void_t;
#else
template <typename...>
using void_t = void;
#endif
template <typename T>
using uncvref_t =
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
template <typename>
struct is_span : std::false_type {};
template <typename T, std::ptrdiff_t S>
struct is_span<span<T, S>> : std::true_type {};
template <typename>
struct is_std_array : std::false_type {};
template <typename T, std::size_t N>
struct is_std_array<std::array<T, N>> : std::true_type {};
template <typename, typename = void>
struct has_size_and_data : std::false_type {};
template <typename T>
struct has_size_and_data<T, void_t<decltype(detail::size(std::declval<T>())),
decltype(detail::data(std::declval<T>()))>>
: std::true_type {};
template <typename C, typename U = uncvref_t<C>>
struct is_container {
static constexpr bool value =
!is_span<U>::value && !is_std_array<U>::value &&
!std::is_array<U>::value && has_size_and_data<C>::value;
};
template <typename T>
using remove_pointer_t = typename std::remove_pointer<T>::type;
template <typename, typename, typename = void>
struct is_container_element_type_compatible : std::false_type {};
template <typename T, typename E>
struct is_container_element_type_compatible<
T, E, void_t<decltype(detail::data(std::declval<T>()))>>
: std::is_convertible<
remove_pointer_t<decltype(detail::data(std::declval<T>()))> (*)[],
E (*)[]> {};
template <typename, typename = size_t>
struct is_complete : std::false_type {};
template <typename T>
struct is_complete<T, decltype(sizeof(T))> : std::true_type {};
} // namespace detail
template <typename ElementType, std::ptrdiff_t Extent>
class span {
static_assert(Extent == dynamic_extent || Extent >= 0,
"A span must have an extent greater than or equal to zero, "
"or a dynamic extent");
static_assert(std::is_object<ElementType>::value,
"A span's ElementType must be an object type (not a "
"reference type or void)");
static_assert(detail::is_complete<ElementType>::value,
"A span's ElementType must be a complete type (not a forward "
"declaration)");
static_assert(!std::is_abstract<ElementType>::value,
"A span's ElementType cannot be an abstract class type");
using storage_type = detail::span_storage<ElementType, Extent>;
public:
// constants and types
using element_type = ElementType;
using value_type = typename std::remove_cv<ElementType>::type;
using index_type = std::size_t;
using difference_type = std::ptrdiff_t;
using pointer = ElementType*;
using reference = ElementType&;
using iterator = pointer;
using const_iterator = const ElementType*;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
static constexpr index_type extent = static_cast<index_type>(Extent);
// [span.cons], span constructors, copy, assignment, and destructor
template <std::ptrdiff_t E = Extent,
typename std::enable_if<E <= 0, int>::type = 0>
constexpr span() noexcept
{}
TCB_SPAN_CONSTEXPR11 span(pointer ptr, index_type count)
: storage_(ptr, count)
{
TCB_SPAN_EXPECT(extent == dynamic_extent || count == extent);
}
TCB_SPAN_CONSTEXPR11 span(pointer first_elem, pointer last_elem)
: storage_(first_elem, last_elem - first_elem)
{
TCB_SPAN_EXPECT(extent == dynamic_extent ||
last_elem - first_elem == extent);
}
template <
std::size_t N, std::ptrdiff_t E = Extent,
typename std::enable_if<
(E == dynamic_extent || static_cast<std::ptrdiff_t>(N) == E) &&
detail::is_container_element_type_compatible<
element_type (&)[N], ElementType>::value,
int>::type = 0>
constexpr span(element_type (&arr)[N]) noexcept : storage_(arr, N)
{}
template <
std::size_t N, std::ptrdiff_t E = Extent,
typename std::enable_if<
(E == dynamic_extent || static_cast<std::ptrdiff_t>(N) == E) &&
detail::is_container_element_type_compatible<
std::array<value_type, N>&, ElementType>::value,
int>::type = 0>
TCB_SPAN_ARRAY_CONSTEXPR span(std::array<value_type, N>& arr) noexcept
: storage_(arr.data(), N)
{}
template <
std::size_t N, std::ptrdiff_t E = Extent,
typename std::enable_if<
(E == dynamic_extent || static_cast<std::ptrdiff_t>(N) == E) &&
detail::is_container_element_type_compatible<
const std::array<value_type, N>&, ElementType>::value,
int>::type = 0>
TCB_SPAN_ARRAY_CONSTEXPR span(const std::array<value_type, N>& arr) noexcept
: storage_(arr.data(), N)
{}
template <typename Container,
typename std::enable_if<
detail::is_container<Container>::value &&
detail::is_container_element_type_compatible<
Container&, ElementType>::value,
int>::type = 0>
TCB_SPAN_CONSTEXPR11 span(Container& cont)
: storage_(detail::data(cont), detail::size(cont))
{
TCB_SPAN_EXPECT(extent == dynamic_extent ||
static_cast<std::ptrdiff_t>(detail::size(cont)) ==
extent);
}
template <typename Container,
typename std::enable_if<
detail::is_container<Container>::value &&
detail::is_container_element_type_compatible<
const Container&, ElementType>::value,
int>::type = 0>
TCB_SPAN_CONSTEXPR11 span(const Container& cont)
: storage_(detail::data(cont), detail::size(cont))
{
TCB_SPAN_EXPECT(extent == dynamic_extent ||
static_cast<std::ptrdiff_t>(detail::size(cont)) ==
extent);
}
constexpr span(const span& other) noexcept = default;
template <typename OtherElementType, std::ptrdiff_t OtherExtent,
typename std::enable_if<
(Extent == OtherExtent || Extent == dynamic_extent) &&
std::is_convertible<OtherElementType (*)[],
ElementType (*)[]>::value,
int>::type = 0>
constexpr span(const span<OtherElementType, OtherExtent>& other) noexcept
: storage_(other.data(), other.size())
{}
~span() noexcept = default;
span& operator=(const span& other) noexcept = default;
// [span.sub], span subviews
template <std::ptrdiff_t Count>
TCB_SPAN_CONSTEXPR11 span<element_type, Count> first() const
{
TCB_SPAN_EXPECT(Count >= 0 && Count <= size());
return {data(), Count};
}
template <std::ptrdiff_t Count>
TCB_SPAN_CONSTEXPR11 span<element_type, Count> last() const
{
TCB_SPAN_EXPECT(Count >= 0 && Count <= size());
return {data() + (size() - Count), Count};
}
template <std::ptrdiff_t Offset, std::ptrdiff_t Count = dynamic_extent>
using subspan_return_t =
span<ElementType, Count != dynamic_extent
? Count
: (Extent != dynamic_extent ? Extent - Offset
: dynamic_extent)>;
template <std::ptrdiff_t Offset, std::ptrdiff_t Count = dynamic_extent>
TCB_SPAN_CONSTEXPR11 subspan_return_t<Offset, Count> subspan() const
{
TCB_SPAN_EXPECT((Offset >= 0 && Offset <= size()) &&
(Count == dynamic_extent ||
(Count >= 0 && Offset + Count <= size())));
return {data() + Offset,
Count != dynamic_extent
? Count
: (Extent != dynamic_extent ? Extent - Offset
: size() - Offset)};
}
TCB_SPAN_CONSTEXPR11 span<element_type, dynamic_extent>
first(index_type count) const
{
TCB_SPAN_EXPECT(count >= 0 && count <= size());
return {data(), count};
}
TCB_SPAN_CONSTEXPR11 span<element_type, dynamic_extent>
last(index_type count) const
{
TCB_SPAN_EXPECT(count >= 0 && count <= size());
return {data() + (size() - count), count};
}
TCB_SPAN_CONSTEXPR11 span<element_type, dynamic_extent>
subspan(index_type offset, index_type count = static_cast<index_type>(dynamic_extent)) const
{
TCB_SPAN_EXPECT((offset >= 0 && offset <= size()) &&
(count == dynamic_extent ||
(count >= 0 && offset + count <= size())));
return {data() + offset,
count == dynamic_extent ? size() - offset : count};
}
// [span.obs], span observers
constexpr index_type size() const noexcept { return storage_.size; }
constexpr index_type size_bytes() const noexcept
{
return size() * sizeof(element_type);
}
constexpr bool empty() const noexcept { return size() == 0; }
// [span.elem], span element access
TCB_SPAN_CONSTEXPR11 reference operator[](index_type idx) const
{
TCB_SPAN_EXPECT(idx >= 0 && idx < size());
return *(data() + idx);
}
/* Extension: not in P0122 */
#ifndef TCB_SPAN_STD_COMPLIANT_MODE
TCB_SPAN_CONSTEXPR14 reference at(index_type idx) const
{
#ifndef TCB_SPAN_NO_EXCEPTIONS
if (idx < 0 || idx >= size()) {
char msgbuf[64] = {
0,
};
std::snprintf(msgbuf, sizeof(msgbuf),
"Index %td is out of range for span of size %td", idx,
size());
throw std::out_of_range{msgbuf};
}
#endif // TCB_SPAN_NO_EXCEPTIONS
return this->operator[](idx);
}
TCB_SPAN_CONSTEXPR11 reference front() const
{
TCB_SPAN_EXPECT(!empty());
return *data();
}
TCB_SPAN_CONSTEXPR11 reference back() const
{
TCB_SPAN_EXPECT(!empty());
return *(data() + (size() - 1));
}
#endif // TCB_SPAN_STD_COMPLIANT_MODE
#ifndef TCB_SPAN_NO_FUNCTION_CALL_OPERATOR
TCB_SPAN_DEPRECATED_FOR("Use operator[] instead")
constexpr reference operator()(index_type idx) const
{
return this->operator[](idx);
}
#endif // TCB_SPAN_NO_FUNCTION_CALL_OPERATOR
constexpr pointer data() const noexcept { return storage_.ptr; }
// [span.iterators], span iterator support
constexpr iterator begin() const noexcept { return data(); }
constexpr iterator end() const noexcept { return data() + size(); }
constexpr const_iterator cbegin() const noexcept { return begin(); }
constexpr const_iterator cend() const noexcept { return end(); }
TCB_SPAN_ARRAY_CONSTEXPR reverse_iterator rbegin() const noexcept
{
return reverse_iterator(end());
}
TCB_SPAN_ARRAY_CONSTEXPR reverse_iterator rend() const noexcept
{
return reverse_iterator(begin());
}
TCB_SPAN_ARRAY_CONSTEXPR const_reverse_iterator crbegin() const noexcept
{
return const_reverse_iterator(cend());
}
TCB_SPAN_ARRAY_CONSTEXPR const_reverse_iterator crend() const noexcept
{
return const_reverse_iterator(cbegin());
}
private:
storage_type storage_{};
};
#ifdef TCB_SPAN_HAVE_DEDUCTION_GUIDES
/* Deduction Guides */
template <class T, size_t N>
span(T (&)[N])->span<T, N>;
template <class T, size_t N>
span(std::array<T, N>&)->span<T, N>;
template <class T, size_t N>
span(const std::array<T, N>&)->span<const T, N>;
template <class Container>
span(Container&)->span<typename Container::value_type>;
template <class Container>
span(const Container&)->span<const typename Container::value_type>;
#endif // TCB_HAVE_DEDUCTION_GUIDES
template <typename ElementType, std::ptrdiff_t Extent>
constexpr span<ElementType, Extent>
make_span(span<ElementType, Extent> s) noexcept
{
return s;
}
#define AS_SIGNED(N) static_cast<std::ptrdiff_t>(N)
template <typename T, std::size_t N>
constexpr span<T, AS_SIGNED(N)> make_span(T (&arr)[N]) noexcept
{
return {arr};
}
template <typename T, std::size_t N>
TCB_SPAN_ARRAY_CONSTEXPR span<T, AS_SIGNED(N)> make_span(std::array<T, N>& arr) noexcept
{
return {arr};
}
template <typename T, std::size_t N>
TCB_SPAN_ARRAY_CONSTEXPR span<const T, AS_SIGNED(N)>
make_span(const std::array<T, N>& arr) noexcept
{
return {arr};
}
#undef AS_SIGNED
template <typename Container>
constexpr span<typename Container::value_type> make_span(Container& cont)
{
return {cont};
}
template <typename Container>
constexpr span<const typename Container::value_type>
make_span(const Container& cont)
{
return {cont};
}
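// Usage sketch (illustrative only): make_span() provides the same type
// deduction for C++11/14 callers that cannot use deduction guides:
//
//     std::array<float, 2> a{};
//     auto s = tcb::make_span(a);   // span<float, 2>
//     std::vector<char> v;
//     auto d = tcb::make_span(v);   // span<char> (dynamic extent)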
/* Comparison operators */
// Implementation note: the implementations of == and < are equivalent to
// the four-iterator ("4-legged") overloads of std::equal and
// std::lexicographical_compare, respectively
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator==(span<T, X> lhs, span<U, Y> rhs)
{
if (lhs.size() != rhs.size()) {
return false;
}
for (std::ptrdiff_t i = 0; i < lhs.size(); i++) {
if (lhs[i] != rhs[i]) {
return false;
}
}
return true;
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator!=(span<T, X> lhs, span<U, Y> rhs)
{
return !(lhs == rhs);
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator<(span<T, X> lhs, span<U, Y> rhs)
{
// No std::min to avoid dragging in <algorithm>
const std::ptrdiff_t size =
lhs.size() < rhs.size() ? lhs.size() : rhs.size();
for (std::ptrdiff_t i = 0; i < size; i++) {
if (lhs[i] < rhs[i]) {
return true;
}
if (lhs[i] > rhs[i]) {
return false;
}
}
return lhs.size() < rhs.size();
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator<=(span<T, X> lhs, span<U, Y> rhs)
{
return !(rhs < lhs);
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator>(span<T, X> lhs, span<U, Y> rhs)
{
return rhs < lhs;
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator>=(span<T, X> lhs, span<U, Y> rhs)
{
return !(lhs < rhs);
}
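// Worked example of the ordering defined above (illustrative only):
// {1, 2} < {1, 3} because the first mismatching elements compare 2 < 3,
// and {1, 2} < {1, 2, 3} because a proper prefix orders before the longer
// span (the final size comparison in operator<).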
template <typename ElementType, std::ptrdiff_t Extent>
span<const byte, ((Extent == dynamic_extent)
                      ? dynamic_extent
                      : (static_cast<std::ptrdiff_t>(sizeof(ElementType)) * Extent))>
as_bytes(span<ElementType, Extent> s) noexcept
{
    return {reinterpret_cast<const byte*>(s.data()), s.size_bytes()};
}
template <
    class ElementType, std::ptrdiff_t Extent,
    typename std::enable_if<!std::is_const<ElementType>::value, int>::type = 0>
span<byte, ((Extent == dynamic_extent)
                ? dynamic_extent
                : (static_cast<std::ptrdiff_t>(sizeof(ElementType)) * Extent))>
as_writable_bytes(span<ElementType, Extent> s) noexcept
{
    return {reinterpret_cast<byte*>(s.data()), s.size_bytes()};
}
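// Usage sketch (illustrative only): viewing the object representation as
// bytes; for fixed-extent spans the byte extent scales by
// sizeof(ElementType). Here `byte` is std::byte under C++17 or the
// library's fallback type otherwise:
//
//     std::uint32_t words[2] = {0, 0};
//     auto ro = tcb::as_bytes(tcb::make_span(words));          // span<const byte, 8>
//     auto rw = tcb::as_writable_bytes(tcb::make_span(words)); // span<byte, 8>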
/* Extension: nonmember subview operations */
#ifndef TCB_SPAN_STD_COMPLIANT_MODE
template <std::ptrdiff_t Count, typename T>
TCB_SPAN_CONSTEXPR11 auto first(T& t)
-> decltype(make_span(t).template first<Count>())
{
return make_span(t).template first<Count>();
}
template <std::ptrdiff_t Count, typename T>
TCB_SPAN_CONSTEXPR11 auto last(T& t)
-> decltype(make_span(t).template last<Count>())
{
return make_span(t).template last<Count>();
}
template <std::ptrdiff_t Offset, std::ptrdiff_t Count = dynamic_extent,
typename T>
TCB_SPAN_CONSTEXPR11 auto subspan(T& t)
-> decltype(make_span(t).template subspan<Offset, Count>())
{
return make_span(t).template subspan<Offset, Count>();
}
template <typename T>
TCB_SPAN_CONSTEXPR11 auto first(T& t, std::ptrdiff_t count)
-> decltype(make_span(t).first(count))
{
return make_span(t).first(count);
}
template <typename T>
TCB_SPAN_CONSTEXPR11 auto last(T& t, std::ptrdiff_t count)
-> decltype(make_span(t).last(count))
{
return make_span(t).last(count);
}
template <typename T>
TCB_SPAN_CONSTEXPR11 auto subspan(T& t, std::ptrdiff_t offset,
std::ptrdiff_t count = dynamic_extent)
-> decltype(make_span(t).subspan(offset, count))
{
return make_span(t).subspan(offset, count);
}
#endif // TCB_SPAN_STD_COMPLIANT_MODE
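// Usage sketch (illustrative only): the nonmember helpers apply to anything
// make_span() accepts, without constructing a span first:
//
//     int raw[4] = {1, 2, 3, 4};
//     auto head = tcb::first<2>(raw);       // span<int, 2> over {1, 2}
//     auto tail = tcb::last(raw, 2);        // span<int> over {3, 4}
//     auto mid  = tcb::subspan(raw, 1, 2);  // span<int> over {2, 3}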
} // namespace TCB_SPAN_NAMESPACE_NAME
/* Extension: support for C++17 structured bindings */
#ifndef TCB_SPAN_STD_COMPLIANT_MODE
namespace TCB_SPAN_NAMESPACE_NAME {
template <std::ptrdiff_t N, typename E, std::ptrdiff_t S>
constexpr auto get(span<E, S> s) -> decltype(s[N])
{
return s[N];
}
} // namespace TCB_SPAN_NAMESPACE_NAME
namespace std {
template <typename E, ptrdiff_t S>
class tuple_size<tcb::span<E, S>> : public integral_constant<size_t, static_cast<size_t>(S)> {};
template <typename E>
class tuple_size<tcb::span<E, tcb::dynamic_extent>>; // not defined
template <size_t N, typename E, ptrdiff_t S>
class tuple_element<N, tcb::span<E, S>> {
public:
using type = E;
};
} // end namespace std
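// Usage sketch (illustrative only): the tuple_size/tuple_element
// specializations plus the ADL-found get() above enable structured bindings
// for fixed-extent spans (dynamic-extent spans are deliberately excluded):
//
//     int xy[2] = {10, 20};
//     auto [x, y] = tcb::span<int, 2>{xy};   // x == 10, y == 20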
#endif // TCB_SPAN_STD_COMPLIANT_MODE
#endif // TCB_SPAN_HPP_INCLUDED