Add nncase v1.0 runtime

pull/125/head
sunnycase 2021-06-09 15:43:37 +08:00
parent 06a2ea71f2
commit d740f55894
118 changed files with 45154 additions and 165 deletions

View File

@ -21,6 +21,10 @@ include(./cmake/macros.internal.cmake)
# Make SDK library and project headers visible (directory-scoped helper macro
# from macros.internal.cmake).
header_directories(${SDK_ROOT}/lib)
header_directories(src/${PROJ})
header_directories(kendryte-standalone-demo/${PROJ})
# Header-only third-party dependencies required by the nncase v1 runtime.
add_subdirectory(third_party/gsl-lite)
add_subdirectory(third_party/mpark-variant)
add_subdirectory(third_party/nlohmann_json)
# build library first
add_subdirectory(lib)

View File

@ -113,7 +113,7 @@ SECTIONS
{
PROVIDE_HIDDEN (__init_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
PROVIDE_HIDDEN (__init_array_end = .);
} >ram AT>ram :ram_ro

View File

@ -42,5 +42,5 @@ SET_SOURCE_FILES_PROPERTIES(${ASSEMBLY_FILES} PROPERTIES COMPILE_FLAGS "-x assem
ADD_LIBRARY(kendryte
${LIB_SRC}
)
TARGET_LINK_LIBRARIES(kendryte PUBLIC nncase)
TARGET_LINK_LIBRARIES(kendryte PUBLIC nncase-wrapper)
SET_TARGET_PROPERTIES(kendryte PROPERTIES LINKER_LANGUAGE C)

View File

@ -574,6 +574,7 @@ handle_breakpoint(uintptr_t cause, uintptr_t epc, uintptr_t regs[32], uintptr_t
uintptr_t __attribute__((weak))
handle_misaligned_load(uintptr_t cause, uintptr_t epc, uintptr_t regs[32], uintptr_t fregs[32])
{
dump_core("misaligned load", cause, epc, regs, fregs);
/* notice this function only support 16bit or 32bit instruction */
bool compressed = (*(unsigned short *)epc & 3) != 3;
@ -665,6 +666,7 @@ handle_fault_load(uintptr_t cause, uintptr_t epc, uintptr_t regs[32], uintptr_t
uintptr_t __attribute__((weak))
handle_misaligned_store(uintptr_t cause, uintptr_t epc, uintptr_t regs[32], uintptr_t fregs[32])
{
dump_core("misaligned store", cause, epc, regs, fregs);
/* notice this function only support 16bit or 32bit instruction */
bool compressed = (*(unsigned short *)epc & 3) != 3;

View File

@ -691,6 +691,7 @@ typedef struct
struct
{
void* nncase_ctx;
uint32_t nncase_version;
};
};
} kpu_model_context_t;

View File

@ -1,11 +1,6 @@
include_directories(${SDK_ROOT}/third_party/xtl/include ${CMAKE_CURRENT_LIST_DIR}/nncase/include)
add_subdirectory(v0)
add_subdirectory(v1)
FILE(GLOB_RECURSE NNCASE_SRC
"${CMAKE_CURRENT_LIST_DIR}/*.c"
"${CMAKE_CURRENT_LIST_DIR}/*.cpp"
)
ADD_LIBRARY(nncase
${NNCASE_SRC}
)
TARGET_COMPILE_OPTIONS(nncase PRIVATE -O2)
add_library(nncase-wrapper STATIC nncase.cpp)
target_link_libraries(nncase-wrapper PRIVATE nncase-v0 nncase-v1)
target_include_directories(nncase-wrapper PUBLIC include)

View File

@ -12,172 +12,51 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nncase.h>
#include <kernels/k210/k210_kernels.h>
#include <runtime/target_interpreter.h>
#include <stdio.h>
#include "v0/nncase_v0.h"
#include "v1/nncase_v1.h"
#include <cstring>
#include <nncase.h>
#include <stdio.h>
#include <utils.h>
using namespace nncase;
using namespace nncase::runtime;
#define NNCASE_DEBUG 0
namespace
extern "C"
{
void kpu_upload_dma(dmac_channel_number_t dma_ch, const uint8_t *src, uint8_t *dest, size_t input_size, plic_irq_callback_t callback, void *userdata)
{
if (is_memory_cache((uintptr_t)src))
struct model_header
{
std::copy_n(src, input_size, dest);
src -= 0x40000000;
uint32_t identifier;
uint32_t version;
};
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
auto header = reinterpret_cast<const model_header *>(buffer);
if (header->version == 4)
return nncase_v0_load_kmodel(ctx, buffer);
else
return nncase_v1_load_kmodel(ctx, buffer);
}
dmac_set_irq(dma_ch, callback, userdata, 1);
dmac_set_single_mode(dma_ch, (void *)src, (void *)dest, DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
dmac_wait_done(dma_ch);
}
}
class nncase_context
{
public:
int load_kmodel(const uint8_t *buffer)
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
int ret = interpreter_.try_load_model(buffer) ? 0 : -1;
uint32_t size = interpreter_.model_size(buffer);
uint8_t *buffer_iomem = (uint8_t *)((uintptr_t)buffer - IOMEM);
const uint8_t *buffer_cache = buffer;
memcpy(buffer_iomem, buffer_cache, size);
for (int i = 0; i < size; i++)
{
if (buffer_iomem[i] != buffer_cache[i])
{
printf("flush model fail:%d %x %x \n", i, buffer_iomem[i], buffer_cache[i]);
while (1)
;
}
}
return ret;
if (ctx->nncase_version == 0)
return nncase_v0_get_output(ctx, index, data, size);
else
return nncase_v1_get_output(ctx, index, data, size);
}
int get_output(uint32_t index, uint8_t **data, size_t *size)
void nncase_model_free(kpu_model_context_t *ctx)
{
if (index >= interpreter_.outputs_size())
return -1;
auto mem = interpreter_.memory_at<uint8_t>(interpreter_.output_at(index));
*data = mem.data();
*size = mem.size();
return 0;
if (ctx->nncase_version == 0)
return nncase_v0_model_free(ctx);
else
return nncase_v1_model_free(ctx);
}
int run_kmodel(const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
done_callback_ = done_callback;
userdata_ = userdata;
interpreter_.dma_ch(dma_ch);
auto input = interpreter_.input_at(0);
auto mem = interpreter_.memory_at<uint8_t>(input);
if (input.memory_type == mem_main)
{
std::copy(src, src + mem.size(), mem.begin());
interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
return 0;
}
else if (input.memory_type == mem_k210_kpu)
{
auto shape = interpreter_.input_shape_at(0);
kernels::k210::kpu_upload(src, mem.data(), shape);
on_upload_done();
return 0;
}
return -1;
}
private:
void on_done()
{
#if NNCASE_DEBUG
printf("Total: %fms\n", interpreter_.total_duration().count() / 1e6);
#endif
if (done_callback_)
done_callback_(userdata_);
}
void on_upload_done()
{
interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
}
static void done_thunk(void *userdata)
{
reinterpret_cast<nncase_context *>(userdata)->on_done();
}
static void on_error_thunk(const char *err, void *userdata)
{
#if NNCASE_DEBUG
printf("Fatal: %s\n", err);
#endif
}
static void node_profile_thunk(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata)
{
#if NNCASE_DEBUG
printf("%s: %fms\n", node_opcode_names(op).data(), duration.count() / 1e6);
#endif
}
static int upload_done_thunk(void *userdata)
{
reinterpret_cast<nncase_context *>(userdata)->on_upload_done();
return 0;
}
private:
interpreter_t interpreter_;
kpu_done_callback_t done_callback_;
void *userdata_;
};
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
auto nnctx = new (std::nothrow) nncase_context();
if (ctx)
{
ctx->is_nncase = 1;
ctx->nncase_ctx = nnctx;
return nnctx->load_kmodel(buffer);
}
else
{
return -1;
if (ctx->nncase_version == 0)
return nncase_v0_run_kmodel(ctx, src, dma_ch, done_callback, userdata);
else
return nncase_v1_run_kmodel(ctx, src, dma_ch, done_callback, userdata);
}
}
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
return nnctx->get_output(index, data, size);
}
void nncase_model_free(kpu_model_context_t *ctx)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
delete nnctx;
ctx->nncase_ctx = nullptr;
}
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
return nnctx->run_kmodel(src, dma_ch, done_callback, userdata);
}

View File

@ -0,0 +1,11 @@
# Legacy (kmodel v4) nncase runtime, built as library target "nncase-v0".
# NOTE(review): file(GLOB_RECURSE) will not pick up newly added sources until
# the next re-configure; prefer an explicit source list once the set is stable.
file(GLOB_RECURSE NNCASE_SRC
    "${CMAKE_CURRENT_LIST_DIR}/*.c"
    "${CMAKE_CURRENT_LIST_DIR}/*.cpp"
)
add_library(nncase-v0
    ${NNCASE_SRC}
)
# Target-scoped include paths instead of directory-scoped include_directories():
# only nncase-v0 consumes xtl and the bundled v0 nncase headers, and PRIVATE
# matches the old behavior (directory includes did not propagate to linkers).
target_include_directories(nncase-v0 PRIVATE
    ${SDK_ROOT}/third_party/xtl/include
    ${CMAKE_CURRENT_LIST_DIR}/nncase/include
)
target_compile_options(nncase-v0 PRIVATE -O2)

View File

@ -0,0 +1,109 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "datatypes.h"
#include <cassert>
#include <cmath>
#include <limits>
namespace nncase
{
namespace quant
{
// Compute the [min, max] range over all finite values in [begin, end).
// NaN and infinity are skipped (via fpclassify) so they cannot poison the
// quantization range. If no finite value is present the result is the
// inverted sentinel pair {FLT_MAX, -FLT_MAX}.
template <class TIt>
value_range<float> get_range(TIt begin, TIt end)
{
    float min = std::numeric_limits<float>::max();
    // BUG FIX: numeric_limits<float>::min() is the smallest *positive normal*
    // value, not the most negative float, so `max` could never drop below
    // ~1.2e-38 and was wrong for all-negative inputs. lowest() is the correct
    // identity element for a running maximum.
    float max = std::numeric_limits<float>::lowest();
    while (begin != end)
    {
        auto value = *begin++;
        auto fc = std::fpclassify(value);
        if (fc == FP_NORMAL || fc == FP_SUBNORMAL || fc == FP_ZERO)
        {
            min = std::min(min, value);
            max = std::max(max, value);
        }
    }
    return { min, max };
}
// Sanitize a raw statistics range so it is usable for quantization:
// clamp extreme magnitudes to +/-1e3, enforce a minimum span so the scale
// stays finite, and make sure the final range always contains zero.
inline value_range<float> fixup_range(value_range<float> range)
{
    // Clamp outliers.
    if (range.min < -1e3)
        range.min = -1e3;
    if (range.max > 1e3)
        range.max = 1e3;

    // Widen degenerate / too-narrow ranges.
    auto span = range.max - range.min;
    if (span == 0)
        span = 0.1f;
    else if (span < 0.01f)
        span = 0.01f;
    range.max = range.min + span;

    // Zero must be representable.
    if (range.max < 0)
        range.max = 0;
    if (range.min > 0)
        range.min = 0;
    return range;
}
// Derive the (zero-point, scale) pair that maps the fixed-up range onto the
// integer interval [0, 2^bits - 1].
inline quant_param_t get_quant_param(value_range<float> range, int32_t bits)
{
    range = fixup_range(range);
    const auto span = range.max - range.min;
    const auto scale = ((1LL << bits) - 1) / span;
    const auto bias = std::round(-range.min * scale);
    // fixup_range guarantees range.min <= 0, so the zero-point is non-negative.
    assert(bias >= 0);
    return { static_cast<int32_t>(bias), scale };
}
// Decompose `value` into a fixed-point pair (mul, shift) such that
// value ~= mul * 2^-shift, with |mul| < 2^bits and 0 <= shift <= max_shift.
// When `is_signed`, one bit of the budget is reserved for the sign.
inline fixed_mul get_fixed_mul(float value, int32_t max_bits, uint8_t max_shift, bool is_signed)
{
    assert(!is_signed || value >= 0);

    auto bits = is_signed ? max_bits - 1 : max_bits;
    int32_t shift = 0;
    float mul = 0;
    if (std::abs(value) > 1)
    {
        // |value| > 1: frexp yields value = mantissa * 2^mul_shift with
        // mantissa in [0.5, 1). Scale the mantissa up by as much of the bit
        // budget as remains after the value's own exponent, capped at max_shift.
        int mul_shift;
        mul = std::frexp(value, &mul_shift);
        shift = std::min((int32_t)max_shift, bits - mul_shift);
        mul = mul * std::pow(2.f, shift + mul_shift);
    }
    else if (value == 0)
    {
        mul = 0;
        shift = 0;
    }
    else
    {
        // 0 < |value| <= 1 (mul_shift <= 0): push the mantissa into the
        // integer range without exceeding `bits` bits of magnitude.
        int mul_shift;
        mul = std::frexp(value, &mul_shift);
        shift = std::min(max_shift + mul_shift, bits);
        mul = mul * std::pow(2.f, shift);
        shift -= mul_shift;
    }

    // Postconditions: the pair is representable and reconstructs `value`
    // to within one float epsilon.
    assert(std::abs(mul) < std::pow(2, bits));
    assert(shift >= 0 && shift <= max_shift);
    assert(std::abs(value - mul * std::pow(2, -shift)) <= std::numeric_limits<float>::epsilon());
    return { mul, static_cast<int8_t>(shift) };
}
}
}

184
lib/nncase/v0/nncase_v0.cpp Normal file
View File

@ -0,0 +1,184 @@
/* Copyright 2019 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nncase_v0.h>
#include <kernels/k210/k210_kernels.h>
#include <runtime/target_interpreter.h>
#include <stdio.h>
#include <cstring>
#include <utils.h>
using namespace nncase;
using namespace nncase::runtime;
#define NNCASE_DEBUG 0
namespace
{
// Copy `input_size` bytes from `src` to `dest` through the given DMA channel;
// `callback(userdata)` is registered on the channel's completion IRQ.
void kpu_upload_dma(dmac_channel_number_t dma_ch, const uint8_t *src, uint8_t *dest, size_t input_size, plic_irq_callback_t callback, void *userdata)
{
    if (is_memory_cache((uintptr_t)src))
    {
        // Source lives in the cached alias: copy once through the CPU, then
        // rebase `src` by -0x40000000 to the uncached alias so the DMA engine
        // reads coherent data. NOTE(review): the subsequent DMA repeats the
        // copy from the uncached view — presumably intentional for coherency;
        // confirm against the K210 memory map.
        std::copy_n(src, input_size, dest);
        src -= 0x40000000;
    }
    dmac_set_irq(dma_ch, callback, userdata, 1);
    // 64-bit transfer width: input_size is divided by 8, so callers are
    // assumed to pass a multiple of 8 bytes — TODO(review): confirm.
    dmac_set_single_mode(dma_ch, (void *)src, (void *)dest, DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
        DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
    dmac_wait_done(dma_ch);
}
}
// C++ implementation behind the v0 C API. Owns the interpreter instance and
// routes the interpreter's C-style completion/error callbacks back to the
// caller-supplied kpu_done_callback_t.
class nncase_context
{
public:
    // Parse the kmodel in `buffer` and mirror it into the uncached IOMEM
    // alias of the same physical memory so the KPU/DMA engines observe it.
    // Returns 0 on success, -1 if the interpreter rejects the model. On a
    // read-back mismatch it prints a diagnostic and spins forever.
    int load_kmodel(const uint8_t *buffer)
    {
        int ret = interpreter_.try_load_model(buffer) ? 0 : -1;
        uint32_t size = interpreter_.model_size(buffer);
        // Uncached view: cached address minus the IOMEM offset.
        uint8_t *buffer_iomem = (uint8_t *)((uintptr_t)buffer - IOMEM);
        const uint8_t *buffer_cache = buffer;
        memcpy(buffer_iomem, buffer_cache, size);
        // Verify byte-by-byte to detect cache-coherency failures.
        for (int i = 0; i < size; i++)
        {
            if (buffer_iomem[i] != buffer_cache[i])
            {
                printf("flush model fail:%d %x %x \n", i, buffer_iomem[i], buffer_cache[i]);
                while (1)
                    ;
            }
        }
        return ret;
    }

    // Expose output tensor `index` as a (pointer, byte size) pair into the
    // interpreter's memory. Returns 0 on success, -1 if out of range.
    int get_output(uint32_t index, uint8_t **data, size_t *size)
    {
        if (index >= interpreter_.outputs_size())
            return -1;
        auto mem = interpreter_.memory_at<uint8_t>(interpreter_.output_at(index));
        *data = mem.data();
        *size = mem.size();
        return 0;
    }

    // Start inference with `src` as input 0. Main-memory inputs are copied
    // and the interpreter is started directly; KPU-memory inputs are uploaded
    // first. `done_callback(userdata)` fires when the run completes.
    // Returns 0 if started, -1 for an unsupported input memory type.
    int run_kmodel(const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
    {
        done_callback_ = done_callback;
        userdata_ = userdata;
        interpreter_.dma_ch(dma_ch);
        auto input = interpreter_.input_at(0);
        auto mem = interpreter_.memory_at<uint8_t>(input);
        if (input.memory_type == mem_main)
        {
            std::copy(src, src + mem.size(), mem.begin());
            interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
            return 0;
        }
        else if (input.memory_type == mem_k210_kpu)
        {
            auto shape = interpreter_.input_shape_at(0);
            kernels::k210::kpu_upload(src, mem.data(), shape);
            on_upload_done();
            return 0;
        }
        return -1;
    }

private:
    // Run finished: report timing (debug builds only) and notify the user.
    void on_done()
    {
#if NNCASE_DEBUG
        printf("Total: %fms\n", interpreter_.total_duration().count() / 1e6);
#endif
        if (done_callback_)
            done_callback_(userdata_);
    }

    // Input upload complete: start the interpreter.
    void on_upload_done()
    {
        interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
    }

    // Static trampolines: the interpreter's C-style callbacks carry `this`
    // through the userdata pointer.
    static void done_thunk(void *userdata)
    {
        reinterpret_cast<nncase_context *>(userdata)->on_done();
    }

    static void on_error_thunk(const char *err, void *userdata)
    {
#if NNCASE_DEBUG
        printf("Fatal: %s\n", err);
#endif
    }

    static void node_profile_thunk(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata)
    {
#if NNCASE_DEBUG
        printf("%s: %fms\n", node_opcode_names(op).data(), duration.count() / 1e6);
#endif
    }

    static int upload_done_thunk(void *userdata)
    {
        reinterpret_cast<nncase_context *>(userdata)->on_upload_done();
        return 0;
    }

private:
    interpreter_t interpreter_;         // v0 nncase interpreter
    kpu_done_callback_t done_callback_; // user completion callback
    void *userdata_;                    // opaque pointer passed back to the user
};
// Allocate a v0 context, attach it to `ctx` and load the kmodel.
// Returns 0 on success, -1 on failure (null ctx, allocation failure,
// or model parse failure).
int nncase_v0_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
    auto nnctx = new (std::nothrow) nncase_context();
    // BUG FIX: the original only tested `ctx`, so a failed nothrow allocation
    // was dereferenced below, and a null `ctx` leaked `nnctx`.
    if (ctx && nnctx)
    {
        ctx->is_nncase = 1;
        ctx->nncase_ctx = nnctx;
        ctx->nncase_version = 0;
        return nnctx->load_kmodel(buffer);
    }
    else
    {
        delete nnctx; // no-op on nullptr; avoids the leak when ctx is null
        return -1;
    }
}
// C ABI shim: forward the output query to the C++ context stored in the
// handle. Return value comes straight from nncase_context::get_output.
int nncase_v0_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
    auto impl = static_cast<nncase_context *>(ctx->nncase_ctx);
    return impl->get_output(index, data, size);
}
// C ABI shim: destroy the C++ context and clear the handle so a stale
// pointer is never left behind in `ctx`.
void nncase_v0_model_free(kpu_model_context_t *ctx)
{
    auto impl = static_cast<nncase_context *>(ctx->nncase_ctx);
    ctx->nncase_ctx = nullptr;
    delete impl;
}
// C ABI shim: start inference via the C++ context stored in the handle.
int nncase_v0_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
    auto impl = static_cast<nncase_context *>(ctx->nncase_ctx);
    return impl->run_kmodel(src, dma_ch, done_callback, userdata);
}

33
lib/nncase/v0/nncase_v0.h Normal file
View File

@ -0,0 +1,33 @@
/* Copyright 2019 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _NNCASE_V0_H
#define _NNCASE_V0_H

#include "kpu.h"

#ifdef __cplusplus
extern "C" {
#endif

/* v0 runtime entry points; the nncase.cpp wrapper dispatches here for
 * kmodels whose header reports version 4. */

/* Load a kmodel into `ctx`. Returns 0 on success, -1 on failure. */
int nncase_v0_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer);
/* Fetch output tensor `index` as a (pointer, byte size) pair. 0 / -1. */
int nncase_v0_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size);
/* Free the context created by nncase_v0_load_kmodel. */
void nncase_v0_model_free(kpu_model_context_t *ctx);
/* Run inference on `src`; done_callback(userdata) fires on completion. 0 / -1. */
int nncase_v0_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata);

#ifdef __cplusplus
}
#endif

#endif

View File

@ -0,0 +1,6 @@
# Point find_package at the prebuilt nncase v1 runtime config shipped in-tree.
set(nncaseruntime_DIR ${CMAKE_CURRENT_LIST_DIR}/lib/cmake/nncaseruntime)
find_package(nncaseruntime REQUIRED)
# Thin adapter translating the SDK's C API onto the v1 runtime.
add_library(nncase-v1 STATIC nncase_v1.cpp)
# Grouping makes the linker re-scan both archives, presumably because the
# runtime core and the K210 module reference each other.
# NOTE(review): GNU ld's documented spelling is --start-group/--end-group;
# confirm the single-dash form is accepted by this toolchain.
target_link_libraries(nncase-v1 PRIVATE -Wl,-start-group nncaseruntime nncase_rt_modules_k210 -Wl,-end-group)

View File

@ -0,0 +1,28 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS
NNCASE_API result<void> conv2d(const float *input, const float *weights, const float *bias, float *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &w_shape, const runtime_shape_t &w_strides,
const runtime_shape_t &bias_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t groups, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,28 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <nncase/kernels/kernel_context.h>
#include <nncase/kernels/kernel_utils.h>
#include <utility>
BEGIN_NS_NNCASE_KERNELS_CPU_OPT
result<void> conv2d(const float *input, const float *weights, const float *bias, float *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &w_shape, const runtime_shape_t &w_strides,
const runtime_shape_t &bias_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t groups, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context) noexcept;
END_NS_NNCASE_KERNELS_CPU_OPT

View File

@ -0,0 +1,54 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_context.h>
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>

// Opens namespace nncase::kernels::cpu::optimized.
#define BEGIN_NS_NNCASE_KERNELS_CPU_OPT \
    namespace nncase                    \
    {                                   \
    namespace kernels                   \
    {                                   \
    namespace cpu                       \
    {                                   \
    namespace optimized                 \
    {

// Closes the four namespaces opened by BEGIN_NS_NNCASE_KERNELS_CPU_OPT.
#define END_NS_NNCASE_KERNELS_CPU_OPT \
    }                                 \
    }                                 \
    }                                 \
    }

// Expand IMPL(byte_width, unsigned_type) for the element byte width of
// `type`; widths other than 1/2/4/8 return err(std::errc::not_supported).
// Intended for use inside functions returning result<T>.
#define TYPE_IMPL_SELECT(type, IMPL) \
    switch (runtime::get_bytes(type)) \
    { \
        IMPL(1, uint8_t); \
        IMPL(2, uint16_t); \
        IMPL(4, uint32_t); \
        IMPL(8, uint64_t); \
    default: \
        return err(std::errc::not_supported); \
    }

// Strategy selector for the optimized copy kernel, chosen from the
// contiguity of the source/destination layouts.
enum copy_impl_select
{
    all_contiguous,  // both sides contiguous
    src_contiguous,  // only the source is contiguous
    dest_contiguous  // only the destination is contiguous
};

View File

@ -0,0 +1,33 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <cstring>
BEGIN_NS_NNCASE_KERNELS_CPU_OPT
NNCASE_API result<void> concat(datatype_t type, gsl::span<const gsl::byte *const> inputs, gsl::byte *output, const runtime_shape_t &out_shape,
gsl::span<const runtime_shape_t> in_strides, const runtime_shape_t &out_strides, size_t axis, const runtime_shape_t &concat_dims,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &begins, const runtime_shape_t &ends, const runtime_axis_t &strides,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> copy(datatype_t type, const gsl::byte *src, gsl::byte *dest,
const runtime_shape_t &shape, const runtime_shape_t &src_strides, const runtime_shape_t &dest_strides,
int dims_offset, copy_impl_select impl_select, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_OPT

View File

@ -0,0 +1,26 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_context.h>
#include "runtime_types.h"
BEGIN_NS_NNCASE_KERNELS_CPU_REF
NNCASE_API result<void> conv2d(const float *input, const float *weights, const float *bias, float *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &w_shape, const runtime_shape_t &w_strides,
const runtime_shape_t &bias_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t groups, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,23 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS_CPU_REF
NNCASE_API result<void> nnil_unary_method(const float *input, float *output, size_t count, gsl::span<const gsl::byte> body, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,25 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS_CPU_REF
NNCASE_API result<void> reduce_window2d(reduce_op_t op, const float *input, float init_value, float *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,72 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
#define BEGIN_NS_NNCASE_KERNELS_CPU_REF \
namespace nncase \
{ \
namespace kernels \
{ \
namespace cpu \
{ \
namespace reference \
{
#define END_NS_NNCASE_KERNELS_CPU_REF \
} \
} \
} \
}
BEGIN_NS_NNCASE_KERNELS_CPU_REF

namespace detail
{
// Recursively walk the index space described by [index_begin, index_end),
// invoking `callable` once per complete index. `index_prefix` accumulates
// the coordinates chosen so far; try_ presumably short-circuits on the
// first error result — see result.h.
template <class Callable>
result<void> apply_impl(Callable &&callable, runtime_shape_t index_prefix, runtime_shape_t::const_iterator index_begin, runtime_shape_t::const_iterator index_end) noexcept
{
    const auto head = *index_begin++;
    index_prefix.push_back(0);
    if (index_begin == index_end)
    {
        // Innermost dimension: invoke the callable for each coordinate.
        for (size_t i = 0; i < head; i++)
        {
            index_prefix.back() = i;
            try_(callable(index_prefix));
        }
    }
    else
    {
        // Recurse into the next dimension.
        for (size_t i = 0; i < head; i++)
        {
            index_prefix.back() = i;
            try_(apply_impl(std::forward<Callable>(callable), index_prefix, index_begin, index_end));
        }
    }
    return ok();
}
}

// Invoke `callable(index)` for every index of `shape`, last dimension
// varying fastest. Propagates the first non-ok result, else returns ok().
template <class Callable>
result<void> apply(const runtime_shape_t &shape, Callable &&callable) noexcept
{
    return detail::apply_impl(std::forward<Callable>(callable), runtime_shape_t(), shape.cbegin(), shape.cend());
}

END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,70 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_context.h>
#include "runtime_types.h"
BEGIN_NS_NNCASE_KERNELS_CPU_REF
NNCASE_API result<void> batch_to_space(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> broadcast(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> concat(datatype_t type, gsl::span<const gsl::byte *const> inputs, gsl::byte *output, const runtime_shape_t &out_shape,
gsl::span<const runtime_shape_t> in_strides, const runtime_shape_t &out_strides, size_t axis, const runtime_shape_t &concat_dims,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> copy(datatype_t type, const gsl::byte *src, gsl::byte *dest,
const runtime_shape_t &shape, const runtime_shape_t &src_strides, const runtime_shape_t &dest_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> transpose(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &perm, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
// Element-wise binary op over strided float tensors with broadcasting;
// the result of each element is clamped to fused_activation.
NNCASE_API result<void> binary(binary_op_t op, const float *input_a, const float *input_b, float *output,
    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
    const runtime_shape_t &in_b_strides, const runtime_shape_t &out_strides, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
// Converts quantized data back to real values: out = (in - ?) * scale + bias style
// affine mapping driven by scale/bias (exact formula lives in the implementation).
NNCASE_API result<void> dequantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
    kernel_context &context = default_kernel_context) noexcept;
// 1-D lookup-table transform: maps each input element through `table`,
// with inputs clamped to [min, max] before the lookup.
NNCASE_API result<void> lut1d(datatype_t type, const gsl::byte *input, const gsl::byte *table, gsl::byte *output, const runtime_shape_t &shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const scalar &min, const scalar &max) noexcept;
// Pads a tensor per-dimension with the given mode (constant/reflect/etc.)
// and fill value.
NNCASE_API result<void> pad(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_paddings_t &paddings, pad_mode_t mode, const scalar &pad_value,
    kernel_context &context = default_kernel_context) noexcept;
// Quantizes real values into the integer domain described by scale/bias.
NNCASE_API result<void> quantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
    kernel_context &context = default_kernel_context) noexcept;
// Element-wise unary op (neg/abs/exp/...) over a strided float tensor.
NNCASE_API result<void> unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
// Reduction (sum/min/max/mean-style, selected by op) over the given axes,
// seeded with init_value; keep_dims controls whether reduced dims stay as 1.
NNCASE_API result<void> reduce(reduce_op_t op, float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context = default_kernel_context) noexcept;
// Strided slice: copies input[begins:ends:strides] into output.
NNCASE_API result<void> slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &begins, const runtime_shape_t &ends, const runtime_axis_t &strides,
    kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,321 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_utils.h>
#include <nncase/runtime/k210/compiler_defs.h>
#include <nncase/runtime/k210/runtime_op_utility.h>
#include <nncase/runtime/k210/runtime_types.h>
#include <nncase/runtime/result.h>
#include <nncase/runtime/runtime_op_utility.h>
BEGIN_NS_NNCASE_KERNELS_K210
namespace detail
{
// Maps a pooling element type to the accumulator type used by kpu_pool2d:
// uint8_t sums accumulate in uint32_t, float accumulates in float.
// Deliberately has no primary definition body, so unsupported element types
// fail to compile.
template <class T>
struct pool_partial_type;

template <>
struct pool_partial_type<uint8_t>
{
    using type = uint32_t;
};

template <>
struct pool_partial_type<float>
{
    using type = float;
};

// Convenience alias: detail::pool_partial_type_t<T>.
template <class T>
using pool_partial_type_t = typename pool_partial_type<T>::type;
}
// Uploads a feature map from dense NCHW into the KPU's RAM layout using DMA.
result<void> kpu_upload(const uint8_t *src, uint8_t *dest, const runtime::k210::kpu_shape_t &in_shape, uint32_t dma_ch);

// Copies a feature map out of the KPU row layout into a dense NCHW buffer.
// Widths that are a multiple of 64 are already dense and copied in one shot;
// otherwise each row is gathered according to the KPU row layout.
inline result<void> kpu_download(const uint8_t *src, uint8_t *dest, const runtime::k210::kpu_shape_t &in_shape)
{
    using namespace runtime::k210;

    const auto width = in_shape[3];
    if (width % 64 == 0)
    {
        // Dense case: a single contiguous copy is sufficient.
        std::copy(src, src + kernels::detail::compute_size(in_shape), dest);
        return ok();
    }

    const auto layout = get_kpu_row_layout(width);
    const auto fmap_size = get_kpu_bytes(width, in_shape[2], in_shape[1]);
    for (uint32_t b = 0; b < in_shape[0]; b++)
    {
        const uint8_t *batch_base = src + (size_t)b * fmap_size;
        for (uint32_t c = 0; c < in_shape[1]; c++)
        {
            // Channels are packed into groups; each group shares 64-byte rows.
            const uint8_t *channel_base = batch_base
                + (size_t)c / layout.groups * layout.row_len * in_shape[2] * 64
                + (size_t)c % layout.groups * layout.row_pitch;
            for (uint32_t row = 0; row < in_shape[2]; row++)
            {
                const uint8_t *row_base = channel_base + (size_t)row * layout.row_len * 64;
                dest = std::copy(row_base, row_base + width, dest);
            }
        }
    }
    return ok();
}
// Bit-exact software model of the K210 KPU convolution.
// Phase 1 (conv): for each output pixel, accumulate the raw u8*u8 dot product
// plus the KPU's correction terms into the int64 `workspace`:
//   value + (arg_x * sum_x >> shift_x) + (arg_w * sum_w >> shift_w) + arg_add * g_ic
// where sum_x / sum_w are the sums of the sampled inputs / weights.
// Phase 2 (bn act): per-channel batchnorm (mul/shift/add) followed by the
// piecewise-linear activation table, saturated to uint8.
// Padding is implicit 'same' (0 for 1x1 filters, 1 otherwise) using pad_value.
template <bool IsDepthwise, int32_t FilterSize>
void kpu_conv2d(const uint8_t *input, int64_t *workspace, uint8_t *output, const uint8_t *weights, int32_t in_h, int32_t in_w, int32_t in_channels, int32_t out_channels, uint8_t pad_value, int32_t arg_x,
    int32_t shift_x, int32_t arg_w, int32_t shift_w, int64_t arg_add, const runtime::k210::kpu_batchnorm_segment *batchnorm, const runtime::k210::kpu_activation_table_t &activation)
{
    const auto channel_size = size_t(in_h) * in_w;
    // conv
    {
        auto out_it = workspace;
        const auto pad = FilterSize == 1 ? 0 : 1;
        // Depthwise: one group per output channel with one in/out channel each;
        // otherwise a single dense group.
        const auto groups = IsDepthwise ? out_channels : 1;
        const auto g_ic = IsDepthwise ? 1 : in_channels / groups;
        const auto g_oc = IsDepthwise ? 1 : out_channels;

        for (int32_t og = 0; og < groups; og++)
        {
            const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * FilterSize * FilterSize;
            for (int32_t oc = 0; oc < g_oc; oc++)
            {
                const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * FilterSize * FilterSize;
                for (int32_t oy = 0; oy < in_h; oy++)
                {
                    for (int32_t ox = 0; ox < in_w; ox++)
                    {
                        const int32_t in_y_origin = oy - pad;
                        const int32_t in_x_origin = ox - pad;
                        int64_t value = 0;
                        int64_t sum_x = 0, sum_w = 0;

                        for (int32_t ic = 0; ic < g_ic; ic++)
                        {
                            const uint8_t *in_c_p = input + ((size_t)og * g_ic + ic) * in_h * in_w;
                            const uint8_t *w_ic_p = w_oc_p + (size_t)ic * FilterSize * FilterSize;
                            for (int32_t ky = 0; ky < FilterSize; ky++)
                            {
                                for (int32_t kx = 0; kx < FilterSize; kx++)
                                {
                                    const int32_t in_y = in_y_origin + ky;
                                    const int32_t in_x = in_x_origin + kx;

                                    uint8_t x;
                                    // Out-of-image taps read the configured pad value.
                                    if (in_x < 0 || in_x >= in_w
                                        || in_y < 0 || in_y >= in_h)
                                        x = pad_value;
                                    else
                                        x = in_c_p[in_y * in_w + in_x];
                                    uint8_t w = w_ic_p[ky * FilterSize + kx];

                                    sum_x += x;
                                    sum_w += w;
                                    value += (int32_t)x * w;
                                }
                            }
                        }

                        *out_it++ = value + (arg_x * sum_x >> shift_x) + (arg_w * sum_w >> shift_w) + arg_add * g_ic;
                    }
                }
            }
        }
    }

    // bn act
    {
        auto src_it = workspace;
        auto out_it = output;
        for (int32_t oc = 0; oc < out_channels; oc++)
        {
            const auto &bn = batchnorm[oc];
            for (size_t i = 0; i < channel_size; i++)
            {
                auto value = (*src_it++ * bn.mul >> bn.shift) + bn.add;
                // Reverse scan picks the last activation segment whose start_x
                // lies below value (NOTE(review): presumably segment 0 starts
                // low enough to always match — confirm table invariant).
                auto &seg = *std::find_if(activation.rbegin(), activation.rend(), [value](const runtime::k210::kpu_activation_segment &seg) {
                    return value > seg.start_x;
                });
                auto act_value = runtime::carry_shift<int64_t, true>((value - seg.start_x) * seg.mul, seg.shift) + seg.add;
                *out_it++ = (uint8_t)kernels::detail::clamp(act_value, int64_t(0), int64_t(255));
            }
        }
    }
}
// Software model of the KPU pooling unit. Filter size, stride and output
// extents are all derived from the hardware pool_type enum; the accumulator
// type comes from detail::pool_partial_type (uint8 sums in uint32).
template <class T>
inline void kpu_pool2d(const T *input, T *output, int32_t in_h, int32_t in_w, int32_t in_channels, runtime::k210::kpu_pool_type_t pool_type)
{
    using namespace runtime::k210;
    using partial_t = detail::pool_partial_type_t<T>;
    const auto filter = get_kpu_filter_size(pool_type);
    const auto stride = get_kpu_filter_stride(pool_type);
    const auto out_h = get_kpu_pool_output_size(in_h, pool_type);
    const auto out_w = get_kpu_pool_output_size(in_w, pool_type);

    for (int32_t oc = 0; oc < in_channels; oc++)
    {
        auto in_c_p = input + (size_t)oc * in_h * in_w;
        for (int32_t oy = 0; oy < out_h; oy++)
        {
            for (int32_t ox = 0; ox < out_w; ox++)
            {
                const int32_t in_y_origin = oy * stride;
                const int32_t in_x_origin = ox * stride;
                partial_t value = 0;

                switch (pool_type)
                {
                // Stride-only "pooling": pass the window's origin sample through.
                case kpu_pool_bypass:
                {
                    const int32_t in_y = in_y_origin;
                    const int32_t in_x = in_x_origin;

                    value = in_c_p[in_y * in_w + in_x];
                    break;
                }
                // Max pooling; out-of-bounds taps contribute the lowest T value
                // so they never win.
                case kpu_pool_max_2_s2:
                case kpu_pool_max_2_s1:
                case kpu_pool_max_4_s4:
                {
                    value = std::numeric_limits<T>::lowest();
                    for (int32_t ky = 0; ky < filter; ky++)
                    {
                        for (int32_t kx = 0; kx < filter; kx++)
                        {
                            const int32_t in_y = in_y_origin + ky;
                            const int32_t in_x = in_x_origin + kx;

                            partial_t in_v;
                            if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
                                in_v = std::numeric_limits<T>::lowest();
                            else
                                in_v = in_c_p[in_y * in_w + in_x];

                            value = std::max(value, in_v);
                        }
                    }

                    break;
                }
                // Mean pooling; edge taps are clamped to the image border
                // (replicate padding) and the sum is divided by the full
                // filter area.
                case kpu_pool_mean_2_s2:
                case kpu_pool_mean_2_s1:
                case kpu_pool_mean_4_s4:
                {
                    for (int32_t ky = 0; ky < filter; ky++)
                    {
                        for (int32_t kx = 0; kx < filter; kx++)
                        {
                            const int32_t in_y = kernels::detail::clamp(in_y_origin + ky, 0, in_h - 1);
                            const int32_t in_x = kernels::detail::clamp(in_x_origin + kx, 0, in_w - 1);
                            const T in_v = in_c_p[in_y * in_w + in_x];

                            value += in_v;
                        }
                    }

                    value /= filter * filter;
                    break;
                }
                // Select pooling: pick a single sample at a fixed offset inside
                // the window; out-of-bounds selections yield 0.
                case kpu_pool_left_top_2_s2:
                case kpu_pool_left_top_4_s4:
                case kpu_pool_right_top_2_s2:
                {
                    auto k_off = get_kpu_select_pool_offset(pool_type);
                    const int32_t in_y = in_y_origin + k_off[0];
                    const int32_t in_x = in_x_origin + k_off[1];
                    partial_t in_v;
                    if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
                        in_v = 0;
                    else
                        in_v = in_c_p[in_y * in_w + in_x];

                    value = in_v;
                    break;
                }
                }

                *output++ = (T)value;
            }
        }
    }
}
// Float reference ("fake") model of the KPU convolution: same geometry as
// kpu_conv2d ('same' padding, depthwise-or-dense grouping) but computed in
// float with per-channel bias and a fused activation clamp. Output is written
// sequentially in NCHW order.
template <bool IsDepthwise, int32_t FilterSize>
void fake_kpu_conv2d(const float *input, float *output, const float *weights, const float *bias, int32_t in_h, int32_t in_w, int32_t in_channels, int32_t out_channels, value_range<float> fused_activation)
{
    // KPU convs are 'same' padded: 1x1 filters need no border, 3x3 need 1.
    constexpr int32_t pad = FilterSize == 1 ? 0 : 1;
    const int32_t groups = IsDepthwise ? out_channels : 1;
    const int32_t group_ic = IsDepthwise ? 1 : in_channels / groups;
    const int32_t group_oc = IsDepthwise ? 1 : out_channels;

    for (int32_t g = 0; g < groups; g++)
    {
        const float *group_weights = weights + (size_t)g * group_oc * group_ic * FilterSize * FilterSize;
        for (int32_t oc = 0; oc < group_oc; oc++)
        {
            const float *oc_weights = group_weights + (size_t)oc * group_ic * FilterSize * FilterSize;
            for (int32_t oy = 0; oy < in_h; oy++)
            {
                for (int32_t ox = 0; ox < in_w; ox++)
                {
                    const int32_t origin_y = oy - pad;
                    const int32_t origin_x = ox - pad;
                    // Clip the filter window to the valid image area; padded
                    // taps would multiply by zero, so they are simply skipped.
                    const int32_t ky_begin = std::max(0, -origin_y);
                    const int32_t ky_end = std::min(FilterSize, in_h - origin_y);
                    const int32_t kx_begin = std::max(0, -origin_x);
                    const int32_t kx_end = std::min(FilterSize, in_w - origin_x);

                    float acc = bias[g * group_oc + oc];
                    for (int32_t ic = 0; ic < group_ic; ic++)
                    {
                        const float *in_channel = input + ((size_t)g * group_ic + ic) * in_h * in_w;
                        const float *ic_weights = oc_weights + (size_t)ic * FilterSize * FilterSize;
                        for (int32_t ky = ky_begin; ky < ky_end; ky++)
                        {
                            for (int32_t kx = kx_begin; kx < kx_end; kx++)
                            {
                                acc += in_channel[(origin_y + ky) * in_w + (origin_x + kx)]
                                    * ic_weights[ky * FilterSize + kx];
                            }
                        }
                    }
                    *output++ = kernels::detail::apply_activation(acc, fused_activation);
                }
            }
        }
    }
}
END_NS_NNCASE_KERNELS_K210

View File

@ -0,0 +1,27 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/result.h>
BEGIN_NS_NNCASE_KERNELS
// Per-invocation kernel execution context. Currently empty; it exists so
// kernel entry points can accept configuration without future signature
// changes.
struct NNCASE_API kernel_context
{
};

// Shared default instance used when callers don't supply a context.
NNCASE_UNUSED static kernel_context default_kernel_context;
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,240 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <nncase/runtime/datatypes.h>
#include <numeric>
#ifdef __GNUC__
#define CXX_RESTRICT __restrict__
#elif _MSC_VER
#define CXX_RESTRICT __restrict
#else
#define CXX_RESTRICT
#endif
BEGIN_NS_NNCASE_KERNELS
// Dot product of an index, given as [first, last), with a stride vector.
// When the index has more dimensions than strides, only the trailing
// strides.size() dimensions participate.
template <class offset_type, class S, class It>
inline offset_type element_offset(const S &strides, It first, It last) noexcept
{
    using difference_type = typename std::iterator_traits<It>::difference_type;
    const auto dims = static_cast<difference_type>((std::min)(static_cast<typename S::size_type>(std::distance(first, last)), strides.size()));
    offset_type result(0);
    auto index_it = last - dims;
    auto stride_it = strides.cend() - dims;
    while (index_it != last)
        result = result + *index_it++ * *stride_it++;
    return result;
}
// Flattens a multi-dimensional index into an element offset via strides.
template <class TShape>
size_t offset(const TShape &strides, const TShape &index)
{
    assert(strides.size() == index.size());
    return element_offset<size_t>(strides, index.cbegin(), index.cend());
}
// Decomposes a flat (row-major) linear index into per-dimension coordinates
// for new_shape; the last dimension varies fastest.
template <class TShape>
TShape reshape_linear_index(const TShape &new_shape, size_t index)
{
    TShape new_index(new_shape.size());
    for (size_t i = new_shape.size(); i-- > 0;)
    {
        new_index[i] = index % new_shape[i];
        index /= new_shape[i];
    }
    return new_index;
}
// Flattens a multi-dimensional index into a (row-major) linear index.
// Fix: the original read index[0] unconditionally, which is out-of-bounds
// for rank-0 (empty) shapes; accumulating from 0 is identical for non-empty
// shapes and returns 0 for the empty case.
template <class TShape>
size_t linear_index(const TShape &shape, const TShape &index)
{
    assert(index.size() == shape.size());
    size_t new_index = 0;
    for (size_t i = 0; i < shape.size(); i++)
        new_index = new_index * shape[i] + index[i];
    return new_index;
}
namespace detail
{
// Spatial output extent of a windowed op (conv/pool): the classic
// floor((size + pad_total - effective_filter) / stride) + 1, expressed as
// (... + stride) / stride.
inline size_t get_windowed_output_size(size_t size, int32_t filter, int32_t stride, int32_t dilation, const padding &padding)
{
    const int32_t effective_filter_size = (filter - 1) * dilation + 1;
    const int32_t extent = (int32_t)size + padding.before + padding.after - effective_filter_size + stride;
    return (size_t)extent / stride;
}
// Computes the numpy-style broadcast shape of two operand shapes: shapes are
// right-aligned and a dimension of 1 broadcasts against the other operand.
// Incompatible dimensions trigger an assert in debug builds.
inline runtime_shape_t get_binary_output_shape(const runtime_shape_t &input_a_shape, const runtime_shape_t &input_b_shape)
{
    runtime_shape_t out_shape;

    const int32_t dest_dims = (int32_t)std::max(input_a_shape.size(), input_b_shape.size());
    const int32_t a_pad = dest_dims - (int32_t)input_a_shape.size();
    const int32_t b_pad = dest_dims - (int32_t)input_b_shape.size();

    for (int32_t i = 0; i < dest_dims; i++)
    {
        const int32_t a_dim = i - a_pad;
        const int32_t b_dim = i - b_pad;
        // Missing leading dimensions behave as size 1.
        const auto a = a_dim < 0 ? 1 : input_a_shape[a_dim];
        const auto b = b_dim < 0 ? 1 : input_b_shape[b_dim];
        if (a == b || b == 1)
            out_shape.push_back(a);
        else if (a == 1)
            out_shape.push_back(b);
        else
            assert(!"inputs are not compatible to broadcast");
    }

    return out_shape;
}
// Total element count of a shape (product of all dims; empty shape -> 1).
template <class TShape>
size_t compute_size(const TShape &shape)
{
    size_t size = 1;
    for (auto dim : shape)
        size *= dim;
    return size;
}
// Clamps value to [min, max]; like std::max(std::min(value, max), min),
// the lower bound wins if min > max.
template <class T>
inline T clamp(T value, T min, T max)
{
    const T upper = (max < value) ? max : value; // std::min(value, max)
    return (upper < min) ? min : upper;          // std::max(..., min)
}
// Clamps a value to the activation's [min, max] range (fused ReLU-style clip).
template <class T>
inline T apply_activation(T value, value_range<T> activation)
{
    return clamp(value, activation.min, activation.max);
}
// Maps an output coordinate back to a coordinate in a (possibly broadcast)
// input shape: dimensions the input broadcasts over (size 1) collapse to 0,
// and extra leading output dimensions are dropped.
template <class TShape>
TShape get_reduced_offset(const TShape &in_offset, const TShape &reduced_shape)
{
    TShape off(reduced_shape.size());
    const auto dims_ext = in_offset.size() - reduced_shape.size();
    for (size_t i = 0; i < reduced_shape.size(); i++)
        off[i] = in_offset[i + dims_ext] < reduced_shape[i] ? in_offset[i + dims_ext] : 0;
    return off;
}
// Shape after reducing over `axis`: reduced dimensions are dropped, or kept
// as 1 when keep_dims is set. A fully-reduced result becomes {1}.
template <class TShape>
TShape get_reduced_shape(const TShape &in_shape, const TShape &axis, bool keep_dims)
{
    TShape shape;
    shape.reserve(in_shape.size() - (keep_dims ? 0 : axis.size()));
    for (size_t i = 0; i < in_shape.size(); i++)
    {
        const bool reduced = std::find(axis.begin(), axis.end(), i) != axis.end();
        if (!reduced)
            shape.push_back(in_shape[i]);
        else if (keep_dims)
            shape.push_back(1);
    }
    if (shape.empty())
        shape.push_back(1);
    return shape;
}
// Number of input elements folded into each output element of a reduction:
// the product of the sizes of the reduced dimensions.
template <class TShape>
size_t get_reduce_block_size(const TShape &in_shape, const TShape &axis)
{
    size_t block = 1;
    for (size_t dim = 0; dim < in_shape.size(); dim++)
    {
        const bool is_reduced = std::find(axis.begin(), axis.end(), dim) != axis.end();
        if (is_reduced)
            block *= in_shape[dim];
    }
    return block;
}
// Maps an input coordinate to its output coordinate after reducing over
// `axis`: reduced dimensions are dropped, or pinned to 0 when keep_dims is
// set. A fully-reduced result becomes {0}.
template <class TShape>
TShape get_reduced_offset(const TShape &in_offset, const TShape &axis, bool keep_dims)
{
    TShape off;
    off.reserve(in_offset.size() - (keep_dims ? 0 : axis.size()));
    for (size_t i = 0; i < in_offset.size(); i++)
    {
        const bool reduced = std::find(axis.begin(), axis.end(), i) != axis.end();
        if (!reduced)
            off.push_back(in_offset[i]);
        else if (keep_dims)
            off.push_back(0);
    }
    if (off.empty())
        off.push_back(0);
    return off;
}
// Default pointer extractor for ranges that already are raw pointers; used by
// kernels (e.g. concat) that accept a customizable getter for other range
// representations.
template <class T, class TRange>
struct default_ptr_getter
{
    T *operator()(const TRange &range) const noexcept { return range; }
};
// Sign-extends the low `Bits` bits of a 32-bit value into a signed int32.
// Bits == 32 needs no extension (and shifting by 32 would be undefined).
template <int32_t Bits>
int32_t to_signed(uint32_t value)
{
    const uint32_t sign_bit = uint32_t(1) << (Bits - 1);
    if (Bits == 32 || (value & sign_bit) == 0)
        return (int32_t)value;
    const uint32_t extension = 0xFFFFFFFF << Bits;
    return (int)(value | extension);
}
// Sign-extends the low `Bits` bits of a 64-bit value into a signed int64.
// Fix: guard the Bits == 64 case — a 64-bit value needs no extension, and
// `0xFFFFFFFFFFFFFFFF << 64` is undefined behavior. This mirrors the
// `Bits != 32` guard already present in the 32-bit overload.
template <int32_t Bits>
int64_t to_signed(uint64_t value)
{
    auto mask = uint64_t(1) << (Bits - 1);
    if (Bits != 64 && (value & mask) != 0)
    {
        auto sign = 0xFFFFFFFFFFFFFFFF << Bits;
        return (int64_t)(value | sign);
    }
    return (int64_t)value;
}
// Quantizes a float into integer type T: round-to-nearest of
// value/scale + zero_point, saturated to T's representable range.
// NOTE(review): calls unqualified lrintf, which needs <cmath>/<math.h>;
// this header does not include it directly — presumably it arrives
// transitively. Confirm before reorganizing includes.
template <class T>
constexpr T quantize(float value, const quant_param_t &param) noexcept
{
    return (T)clamp((int32_t)lrintf(value / param.scale + param.zero_point), (int32_t)std::numeric_limits<T>::lowest(), (int32_t)std::numeric_limits<T>::max());
}
}
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,795 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "../kernel_utils.h"
#include <cmath>
#include <nncase/runtime/nnil.h>
#include <nncase/runtime/runtime_op_utility.h>
#include <xtl/xspan.hpp>
#ifdef __riscv
#include "../riscv/neutral_kernels.h"
#endif
namespace nncase::kernels::neutral
{
// Element-wise float binary op with numpy-style broadcasting over 4-D shapes.
// Fast path copies element-by-element when both operands share a shape;
// otherwise it walks the output space and maps each coordinate back into the
// operands via get_reduced_offset. Results are clamped to fused_activation.
template <class TOp, class TShape>
void binary(const float *input_a, const float *input_b, float *output, const TShape &in_a_shape,
    const TShape &in_b_shape, const TShape &out_shape, const value_range<float> &fused_activation, TOp &&op)
{
    // opt. no broadcast
    if (in_a_shape == in_b_shape)
    {
        auto size = kernels::detail::compute_size(in_a_shape);
        for (size_t i = 0; i < size; i++)
        {
            const auto a = input_a[i];
            const auto b = input_b[i];
            output[i] = kernels::detail::apply_activation(op(a, b), fused_activation);
        }
    }
    // fallback
    else
    {
        for (size_t d0 = 0; d0 < out_shape[0]; d0++)
        {
            for (size_t d1 = 0; d1 < out_shape[1]; d1++)
            {
                for (size_t d2 = 0; d2 < out_shape[2]; d2++)
                {
                    for (size_t d3 = 0; d3 < out_shape[3]; d3++)
                    {
                        TShape in_off = { d0, d1, d2, d3 };
                        // Broadcast dims collapse to index 0 in each operand.
                        const auto in_a_off = kernels::detail::get_reduced_offset(in_off, in_a_shape);
                        const auto in_b_off = kernels::detail::get_reduced_offset(in_off, in_b_shape);
                        const auto a = input_a[offset(in_a_shape, in_a_off)];
                        const auto b = input_b[offset(in_b_shape, in_b_off)];

                        output[offset(out_shape, in_off)] = kernels::detail::apply_activation(op(a, b), fused_activation);
                    }
                }
            }
        }
    }
}
// Quantized element-wise binary op with numpy-style broadcasting over 4-D
// shapes. Each operand is offset then rescaled (mul / carry-shift) into a
// common domain, combined with `op`, rescaled again, offset and saturated
// to uint8.
// Fix: broadcast-loop counters are size_t (matching `binary` above) instead
// of int32_t — the int32_t counters were compared against the unsigned shape
// elements and narrowed when building the coordinate `in_off`.
template <class TOp, class TShape>
void quantized_binary(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const TShape &in_a_shape,
    const TShape &in_b_shape, const TShape &out_shape, int32_t input_a_offset, int32_t input_a_mul, int32_t input_a_shift,
    int32_t input_b_offset, int32_t input_b_mul, int32_t input_b_shift, int32_t output_mul, int32_t output_shift, int32_t output_offset, TOp &&op)
{
    // opt. no broadcast
    if (in_a_shape == in_b_shape)
    {
        auto size = kernels::detail::compute_size(in_a_shape);
        for (size_t i = 0; i < size; i++)
        {
            auto a = (int32_t)input_a[i];
            auto b = (int32_t)input_b[i];
            a = runtime::mul_and_carry_shift(a + input_a_offset, input_a_mul, input_a_shift);
            b = runtime::mul_and_carry_shift(b + input_b_offset, input_b_mul, input_b_shift);

            auto output_val = runtime::mul_and_carry_shift(op(a, b), output_mul, output_shift);
            output[i] = (uint8_t)std::clamp(output_val + output_offset, 0, 255);
        }
    }
    // fallback
    else
    {
        for (size_t d0 = 0; d0 < out_shape[0]; d0++)
        {
            for (size_t d1 = 0; d1 < out_shape[1]; d1++)
            {
                for (size_t d2 = 0; d2 < out_shape[2]; d2++)
                {
                    for (size_t d3 = 0; d3 < out_shape[3]; d3++)
                    {
                        TShape in_off = { d0, d1, d2, d3 };
                        // Broadcast dims collapse to index 0 in each operand.
                        const auto in_a_off = kernels::detail::get_reduced_offset(in_off, in_a_shape);
                        const auto in_b_off = kernels::detail::get_reduced_offset(in_off, in_b_shape);
                        auto a = (int32_t)input_a[offset(in_a_shape, in_a_off)];
                        auto b = (int32_t)input_b[offset(in_b_shape, in_b_off)];
                        a = runtime::mul_and_carry_shift(a + input_a_offset, input_a_mul, input_a_shift);
                        b = runtime::mul_and_carry_shift(b + input_b_offset, input_b_mul, input_b_shift);

                        auto output_val = runtime::mul_and_carry_shift(op(a, b), output_mul, output_shift);
                        output[offset(out_shape, in_off)] = (uint8_t)std::clamp(output_val + output_offset, 0, 255);
                    }
                }
            }
        }
    }
}
// Concatenates input buffers along a dimension. For each outer slice, input i
// contributes inner_size * concat_dims[i] bytes; `getter` extracts the raw
// pointer from each range element.
template <class TRange, class TPtrGetter = detail::default_ptr_getter<uint8_t, TRange>>
inline void concat(xtl::span<TRange> inputs, uint8_t *output, xtl::span<const int32_t> concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {})
{
    for (size_t outer = 0; outer < outer_size; outer++)
    {
        for (size_t i = 0; i < inputs.size(); i++)
        {
            const size_t chunk = inner_size * concat_dims[i];
            auto src = getter(inputs[i]) + outer * chunk;
            output = std::copy(src, src + chunk, output);
        }
    }
}
// Grouped float conv2d over NCHW input with stride, dilation and asymmetric
// padding. Output spatial size follows detail::get_windowed_output_size;
// every output element starts from its channel's bias and ends with the fused
// activation clamp. Output is written sequentially in dense NCHW order.
template <class TShape>
void conv2d(const float *input, float *output, const float *weights, const float *bias, const TShape &in_shape,
    int32_t groups, int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
    const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
    const auto out_h = detail::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
    const auto out_w = detail::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
    const auto g_ic = in_shape[1] / groups;
    const auto g_oc = (size_t)out_channels / groups;

    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];

        for (size_t og = 0; og < (size_t)groups; og++)
        {
            const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
            const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;

            for (size_t oc = 0; oc < g_oc; oc++)
            {
                const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;

                for (size_t oy = 0; oy < out_h; oy++)
                {
                    for (size_t ox = 0; ox < out_w; ox++)
                    {
                        const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
                        const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
                        // Clip the (dilated) filter window to the valid input
                        // region; padded taps contribute zero, so they're skipped.
                        const size_t filter_y_start = (size_t)std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
                        const size_t filter_y_end = (size_t)std::min(filter_h, ((int32_t)in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
                        const size_t filter_x_start = (size_t)std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
                        const size_t filter_x_end = (size_t)std::min(filter_w, ((int32_t)in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
                        float value = bias[og * g_oc + oc];

                        for (size_t ic = 0; ic < g_ic; ic++)
                        {
                            const float *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
                            const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;

                            for (size_t ky = filter_y_start; ky < filter_y_end; ky++)
                            {
                                for (size_t kx = filter_x_start; kx < filter_x_end; kx++)
                                {
                                    const size_t in_y = in_y_origin + dilation_h * ky;
                                    const size_t in_x = in_x_origin + dilation_w * kx;

                                    const float in_v = in_c_p[in_y * in_shape[3] + in_x];
                                    const float w = w_ic_p[ky * filter_w + kx];

                                    value += in_v * w;
                                }
                            }
                        }

                        *output++ = detail::apply_activation(value, fused_activation);
                    }
                }
            }
        }
    }
}
// Grouped quantized (uint8) conv2d. Inputs and weights are shifted by their
// zero-point offsets, accumulated in int32 with per-channel int32 bias, then
// requantized via mul/carry-shift, offset and saturated to [0, 255].
// Same windowing/grouping geometry as the float conv2d above.
template <class TShape>
void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, int32_t input_offset, int32_t filter_offset,
    int32_t output_mul, int32_t output_shift, int32_t output_offset, const TShape &in_shape, int32_t groups, int32_t out_channels,
    int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
    const padding &padding_h, const padding &padding_w)
{
    const auto out_h = detail::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
    const auto out_w = detail::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
    const auto g_ic = in_shape[1] / groups;
    const auto g_oc = out_channels / groups;

    for (int32_t batch = 0; batch < in_shape[0]; batch++)
    {
        const uint8_t *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];

        for (int32_t og = 0; og < groups; og++)
        {
            const uint8_t *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
            const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;

            for (int32_t oc = 0; oc < g_oc; oc++)
            {
                const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;

                for (int32_t oy = 0; oy < out_h; oy++)
                {
                    for (int32_t ox = 0; ox < out_w; ox++)
                    {
                        const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
                        const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
                        // Clip the (dilated) filter window to the valid input region.
                        const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
                        const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
                        const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
                        const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
                        int32_t value = bias[og * g_oc + oc];

                        for (int32_t ic = 0; ic < g_ic; ic++)
                        {
                            const uint8_t *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
                            const uint8_t *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;

                            for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
                            {
                                for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
                                {
                                    const int32_t in_y = in_y_origin + dilation_h * ky;
                                    const int32_t in_x = in_x_origin + dilation_w * kx;

                                    const int32_t in_v = (int32_t)in_c_p[in_y * in_shape[3] + in_x] + input_offset;
                                    const int32_t w = (int32_t)w_ic_p[ky * filter_w + kx] + filter_offset;

                                    value += in_v * w;
                                }
                            }
                        }

                        // Requantize: scale, add output zero-point, saturate.
                        auto output_val = static_cast<int32_t>(runtime::mul_and_carry_shift(value, output_mul, output_shift));
                        output_val += output_offset;
                        *output++ = (uint8_t)std::clamp(output_val, 0, 255);
                    }
                }
            }
        }
    }
}
// Grouped float transposed conv2d (deconvolution), implemented as a scatter:
// the output is zero-initialized, then each input element accumulates its
// weighted contribution into the output window it maps to.
// Bias is expected to be all zeros (asserted per element in debug builds and
// otherwise unused — hence [[maybe_unused]]); the fused activation clamp is
// applied in a final pass only when it is not the full range.
template <class TShape>
void conv2d_transpose(const float *input, float *output, const float *weights, [[maybe_unused]] const float *bias, const TShape &in_shape,
    int32_t groups, const TShape &out_shape, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
    const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
    std::fill(output, output + kernels::detail::compute_size(out_shape), 0.f);
    const auto g_ic = in_shape[1] / groups;
    const auto g_oc = out_shape[1] / groups;

    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        float *out_batch_p = output + (size_t)batch * out_shape[1] * out_shape[2] * out_shape[3];

        for (size_t g = 0; g < (size_t)groups; g++)
        {
            float *out_group_p = out_batch_p + (size_t)g * g_oc * out_shape[2] * out_shape[3];
            const float *w_group_p = weights + (size_t)g * g_oc * g_ic * filter_h * filter_w;

            for (size_t ic = 0; ic < g_ic; ic++)
            {
                for (size_t iy = 0; iy < in_shape[2]; iy++)
                {
                    for (size_t ix = 0; ix < in_shape[3]; ix++)
                    {
                        const int32_t out_y_origin = (iy * stride_h) - padding_h.before;
                        const int32_t out_x_origin = (ix * stride_w) - padding_w.before;
                        // Clip the scatter window to the valid output region.
                        const size_t filter_y_start = (size_t)std::max(0, (-out_y_origin + dilation_h - 1) / dilation_h);
                        const size_t filter_y_end = (size_t)std::min(filter_h, ((int32_t)out_shape[2] - out_y_origin + dilation_h - 1) / dilation_h);
                        const size_t filter_x_start = (size_t)std::max(0, (-out_x_origin + dilation_w - 1) / dilation_w);
                        const size_t filter_x_end = (size_t)std::min(filter_w, ((int32_t)out_shape[3] - out_x_origin + dilation_w - 1) / dilation_w);
                        const float in_v = *input++;

                        for (size_t oc = 0; oc < g_oc; oc++)
                        {
                            // Non-zero bias is unsupported by this scatter formulation.
                            assert(bias[g * g_oc + oc] == 0.f);
                            float *out_c_p = out_group_p + (size_t)oc * out_shape[2] * out_shape[3];
                            const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
                            const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;

                            for (size_t ky = filter_y_start; ky < filter_y_end; ky++)
                            {
                                for (size_t kx = filter_x_start; kx < filter_x_end; kx++)
                                {
                                    const int32_t out_y = out_y_origin + dilation_h * ky;
                                    const int32_t out_x = out_x_origin + dilation_w * kx;

                                    const float w = w_ic_p[ky * filter_w + kx];

                                    out_c_p[out_y * out_shape[3] + out_x] += in_v * w;
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (fused_activation != value_range<float>::full())
    {
        for (size_t i = 0; i < kernels::detail::compute_size(out_shape); i++)
            output[i] = detail::apply_activation(output[i], fused_activation);
    }
}
// Converts quantized values back to float: (q - zero_point) * scale.
// On RISC-V targets this dispatches to the hand-optimized implementation.
template <class TQ>
void dequantize(const TQ *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, const quant_param_t &param)
{
#if __riscv
    riscv_dequantize(input, output, count, param);
#else
    const auto zero = param.zero_point;
    const auto scale = param.scale;
    for (size_t i = 0; i < count; i++)
        output[i] = (input[i] - zero) * scale;
#endif
}
// Row-major float matmul: output[a_rows x b_cols] = input_a[a_rows x a_cols]
// * input_b[a_cols x b_cols], seeded with one bias per output column and
// finished with the fused activation clamp.
inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
{
    for (int32_t row = 0; row < a_rows; row++)
    {
        const float *a_row = input_a + row * a_cols;
        for (int32_t col = 0; col < b_cols; col++)
        {
            float acc = bias[col];
            for (int32_t k = 0; k < a_cols; k++)
                acc += a_row[k] * input_b[k * b_cols + col];
            output[row * b_cols + col] = detail::apply_activation(acc, fused_activation);
        }
    }
}
// Quantized matmul: accumulates (a + a_offset) * (b + b_offset) in int32 with
// a per-column int32 bias, then requantizes via mul/carry-shift plus
// output_offset and saturates to uint8.
inline void quantized_matmul(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const int32_t *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, int32_t input_a_offset, int32_t input_b_offset,
    int32_t output_mul, int32_t output_shift, int32_t output_offset)
{
    for (int32_t row = 0; row < a_rows; row++)
    {
        for (int32_t col = 0; col < b_cols; col++)
        {
            int32_t acc = bias[col];
            for (int32_t k = 0; k < a_cols; k++)
            {
                const int32_t a = (int32_t)input_a[row * a_cols + k] + input_a_offset;
                const int32_t b = (int32_t)input_b[k * b_cols + col] + input_b_offset;
                acc += a * b;
            }
            const int32_t requantized = static_cast<int32_t>(runtime::mul_and_carry_shift(acc, output_mul, output_shift)) + output_offset;
            output[row * b_cols + col] = (uint8_t)std::clamp(requantized, 0, 255);
        }
    }
}
// Constant-pads a 4-D tensor. The output shape is the input plus each dim's
// before/after padding; elements inside the pad region receive pad_value,
// the rest are copied from the (negatively shifted) input position.
// NOTE(review): in0/in1/in2 intermediate pointers can point outside the input
// while inside the pad region; they are only dereferenced on the copy path,
// but the arithmetic itself relies on that being tolerated — confirm.
template <class T, class TShape, class TPaddings>
void pad(const T *input, T *output, const TShape &in_shape, const TPaddings &paddings, T pad_value)
{
    TShape out_shape = { in_shape[0] + paddings[0].sum(),
        in_shape[1] + paddings[1].sum(),
        in_shape[2] + paddings[2].sum(),
        in_shape[3] + paddings[3].sum() };

    for (int d0 = 0; d0 < out_shape[0]; d0++)
    {
        auto d0_origin = -paddings[0].before;
        auto in0 = input + ((size_t)d0_origin + d0) * in_shape[1] * in_shape[2] * in_shape[3];

        for (int d1 = 0; d1 < out_shape[1]; d1++)
        {
            auto d1_origin = -paddings[1].before;
            auto in1 = in0 + ((size_t)d1_origin + d1) * in_shape[2] * in_shape[3];

            for (int d2 = 0; d2 < out_shape[2]; d2++)
            {
                auto d2_origin = -paddings[2].before;
                auto in2 = in1 + ((size_t)d2_origin + d2) * in_shape[3];

                for (int d3 = 0; d3 < out_shape[3]; d3++)
                {
                    auto d3_origin = -paddings[3].before;

                    // Inside any dim's pad band -> fill; otherwise copy through.
                    if (d0 < paddings[0].before || d0 >= out_shape[0] - paddings[0].after
                        || d1 < paddings[1].before || d1 >= out_shape[1] - paddings[1].after
                        || d2 < paddings[2].before || d2 >= out_shape[2] - paddings[2].after
                        || d3 < paddings[3].before || d3 >= out_shape[3] - paddings[3].after)
                        *output++ = pad_value;
                    else
                        *output++ = in2[d3_origin + d3];
                }
            }
        }
    }
}
// Quantizes floats into TQ: round-to-nearest of value/scale + zero_point,
// saturated to TQ's representable range. Dispatches to the hand-optimized
// RISC-V implementation on those targets.
template <class TQ>
void quantize(const float *CXX_RESTRICT input, TQ *CXX_RESTRICT output, size_t count, const quant_param_t &param)
{
#if __riscv
    riscv_quantize(input, output, count, param);
#else
    constexpr int32_t lo = (int32_t)std::numeric_limits<TQ>::lowest();
    constexpr int32_t hi = (int32_t)std::numeric_limits<TQ>::max();
    for (size_t i = 0; i < count; i++)
    {
        const auto rounded = (int32_t)std::nearbyintf(input[i] / param.scale + param.zero_point);
        output[i] = (TQ)std::clamp(rounded, lo, hi);
    }
#endif
}
// Generic 4-D reduction: output is filled with init_value, then every input
// element is folded into its reduced position with `reducer` (a binary op,
// e.g. sum/min/max). The output coordinate is derived per element via
// get_reduced_offset against reduced_shape.
template <class TReducer, class TShape>
void reduce(const float *input, float *output, float init_value, const TShape &in_shape, const TShape &reduced_shape, TReducer &&reducer)
{
    std::fill(output, output + kernels::detail::compute_size(reduced_shape), init_value);

    for (size_t d0 = 0; d0 < in_shape[0]; d0++)
    {
        for (size_t d1 = 0; d1 < in_shape[1]; d1++)
        {
            for (size_t d2 = 0; d2 < in_shape[2]; d2++)
            {
                for (size_t d3 = 0; d3 < in_shape[3]; d3++)
                {
                    runtime_shape_t in_off = { d0, d1, d2, d3 };
                    auto out_off = kernels::detail::get_reduced_offset(in_off, reduced_shape);
                    const auto a = input[offset(in_shape, in_off)];
                    auto &b = output[offset(reduced_shape, out_off)];
                    b = reducer(b, a);
                }
            }
        }
    }
}
// Applies a unary op element-wise: output[i] = op(input[i]).
template <class TOp>
void unary(const float *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, TOp &&op)
{
    for (size_t i = 0; i < count; i++)
        *output++ = op(*input++);
}
// Sliding-window 2-D reduction (the common kernel behind max/avg pooling).
// Each window is folded with binary_op starting from init_value; window_op
// receives the folded value and the number of in-bounds taps (kernel_count),
// which lets average pooling divide by the actual window area. The result is
// clamped to fused_activation.
template <class TBinaryOp, class TOutputOp, class TShape>
void reduce_window2d(const float *input, float *output, float init_value, const TShape &in_shape, int32_t filter_h, int32_t filter_w,
    int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, const padding &padding_h, const padding &padding_w,
    const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
{
    const auto out_h = kernels::detail::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
    const auto out_w = kernels::detail::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
    runtime_shape_t out_shape { in_shape[0], in_shape[1], out_h, out_w };

    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        for (size_t oc = 0; oc < in_shape[1]; oc++)
        {
            for (size_t oy = 0; oy < out_h; oy++)
            {
                for (size_t ox = 0; ox < out_w; ox++)
                {
                    const int32_t in_y_origin = ((int32_t)oy * stride_h) - padding_h.before;
                    const int32_t in_x_origin = ((int32_t)ox * stride_w) - padding_w.before;
                    // Clip the (dilated) window to the valid input region;
                    // padded taps are excluded from both the fold and the count.
                    const size_t filter_y_start = (size_t)std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
                    const size_t filter_y_end = (size_t)std::min(filter_h, ((int32_t)in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
                    const size_t filter_x_start = (size_t)std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
                    const size_t filter_x_end = (size_t)std::min(filter_w, ((int32_t)in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
                    float value = init_value;
                    int32_t kernel_count = 0;

                    for (size_t ky = filter_y_start; ky < filter_y_end; ky++)
                    {
                        for (size_t kx = filter_x_start; kx < filter_x_end; kx++)
                        {
                            const size_t in_y = in_y_origin + dilation_h * ky;
                            const size_t in_x = in_x_origin + dilation_w * kx;

                            const float in_v = input[offset(in_shape, { batch, oc, in_y, in_x })];

                            value = binary_op(value, in_v);
                            kernel_count++;
                        }
                    }

                    output[offset(out_shape, { batch, oc, oy, ox })] = kernels::detail::apply_activation(window_op(value, kernel_count), fused_activation);
                }
            }
        }
    }
}
// Nearest-neighbour 2-D resize over NCHW data. Each destination pixel
// copies the source pixel at floor(dest * in_extent / out_extent),
// clamped to the source bounds.
template <class T, class TShape>
void resize_nearest_neighbor(const T *input, T *output, const TShape &in_shape, int32_t out_h, int32_t out_w)
{
    const auto height_scale = (float)in_shape[2] / out_h;
    const auto width_scale = (float)in_shape[3] / out_w;
    const size_t plane = (size_t)in_shape[2] * in_shape[3];

    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        for (size_t oc = 0; oc < in_shape[1]; oc++)
        {
            const T *src_plane = input + (batch * in_shape[1] + oc) * plane;
            for (size_t oy = 0; oy < (size_t)out_h; oy++)
            {
                const size_t in_y = std::min((size_t)floorf(oy * height_scale), in_shape[2] - 1);
                const T *src_row = src_plane + in_y * in_shape[3];
                for (size_t ox = 0; ox < (size_t)out_w; ox++)
                {
                    const size_t in_x = std::min((size_t)floorf(ox * width_scale), in_shape[3] - 1);
                    *output++ = src_row[in_x];
                }
            }
        }
    }
}
// Bilinear 2-D resize over NCHW data. When align_corners is set (and the
// output extent is > 1), corner pixels of input and output map onto each
// other exactly. NOTE(review): weights are computed per output pixel in
// float; the expression order is rounding-sensitive, keep as-is.
template <class T, class TShape>
inline void resize_bilinear(const T *input, T *output, const TShape &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
{
    auto height_scale = (float)in_shape[2] / out_h;
    auto width_scale = (float)in_shape[3] / out_w;
    if (align_corners && out_h > 1)
        height_scale = (float)(in_shape[2] - 1) / (out_h - 1);
    if (align_corners && out_w > 1)
        width_scale = (float)(in_shape[3] - 1) / (out_w - 1);

    auto destIdx = 0; // linear write index into the densely-packed output
    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
        for (size_t oc = 0; oc < in_shape[1]; oc++)
        {
            auto in_c = in_batch + (size_t)oc * in_shape[2] * in_shape[3];
            for (size_t oy = 0; oy < (size_t)out_h; oy++)
            {
                // Fractional source row and its two neighbouring rows
                // (the lower neighbour is clamped at the bottom edge).
                auto in_y = oy * height_scale;
                auto in_y0 = (size_t)floorf(in_y);
                auto in_y1 = std::min(in_y0 + 1, in_shape[2] - 1);
                for (size_t ox = 0; ox < (size_t)out_w; ox++)
                {
                    auto in_x = ox * width_scale;
                    auto in_x0 = (size_t)floorf(in_x);
                    auto in_x1 = std::min(in_x0 + 1, in_shape[3] - 1);

                    // Four neighbours: v0 = top-left, v1 = bottom-left,
                    // v2 = top-right, v3 = bottom-right.
                    auto v0 = in_c[in_y0 * in_shape[3] + in_x0];
                    auto v1 = in_c[in_y1 * in_shape[3] + in_x0];
                    auto v2 = in_c[in_y0 * in_shape[3] + in_x1];
                    auto v3 = in_c[in_y1 * in_shape[3] + in_x1];

                    // Bilinear weights from the fractional coordinate parts.
                    auto a0 = (1 - (in_y - in_y0)) * (1 - (in_x - in_x0));
                    auto a1 = (in_y - in_y0) * (1 - (in_x - in_x0));
                    auto a2 = (1 - (in_y - in_y0)) * (in_x - in_x0);
                    auto a3 = (in_y - in_y0) * (in_x - in_x0);

                    output[destIdx++] = T(v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3);
                }
            }
        }
    }
}
// Numerically-stable softmax over `outer_size` independent rows of
// `inner_size` elements each. `beta` scales the logits before
// exponentiation (beta == 1 gives the standard softmax).
inline void softmax(const float *input, float *output, float beta, int32_t outer_size, size_t inner_size)
{
    for (int32_t batch = 0; batch < outer_size; batch++)
    {
        const float *row_in = input + batch * inner_size;
        float *row_out = output + batch * inner_size;

        // Subtract the row maximum before exp() so large logits cannot overflow.
        const float row_max = *std::max_element(row_in, row_in + inner_size);
        float denom = 0.f;
        for (size_t i = 0; i < inner_size; i++)
        {
            const float e = expf((row_in[i] - row_max) * beta);
            denom += e;
            row_out[i] = e;
        }

        // Normalize the row so it sums to 1.
        for (size_t i = 0; i < inner_size; i++)
            row_out[i] /= denom;
    }
}
template <class T, class TShape>
void transpose(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const TShape &in_shape, const TShape &in_strides, const TShape &out_strides, const TShape &perm)
{
runtime_shape_t out_shape(in_shape.size());
for (size_t i = 0; i < in_shape.size(); i++)
out_shape[i] = in_shape[perm[i]];
runtime_shape_t i(4), o(4);
for (o[3] = 0; o[3] < out_shape[3]; o[3]++)
{
i[perm[3]] = o[3];
for (o[2] = 0; o[2] < out_shape[2]; o[2]++)
{
i[perm[2]] = o[2];
for (o[1] = 0; o[1] < out_shape[1]; o[1]++)
{
i[perm[1]] = o[1];
for (o[0] = 0; o[0] < out_shape[0]; o[0]++)
{
i[perm[0]] = o[0];
output[offset(out_strides, o)] = input[offset(in_strides, i)];
}
}
}
}
}
// Copies the elements selected by [begin, end) with the given (possibly
// negative) per-axis strides into a densely-packed output, NCHW order.
template <class T, class TShape>
void strided_slice(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const TShape &in_shape, const TShape &begin, const TShape &end, const TShape &strides)
{
    // A negative stride walks backwards, so the stop test flips direction.
    auto keep_going = [](int32_t pos, int32_t stop, int32_t step) {
        return step > 0 ? pos < stop : pos > stop;
    };

    for (int32_t d0 = begin[0]; keep_going(d0, end[0], strides[0]); d0 += strides[0])
    {
        auto plane0 = input + (size_t)d0 * in_shape[1] * in_shape[2] * in_shape[3];
        for (int32_t d1 = begin[1]; keep_going(d1, end[1], strides[1]); d1 += strides[1])
        {
            auto plane1 = plane0 + (size_t)d1 * in_shape[2] * in_shape[3];
            for (int32_t d2 = begin[2]; keep_going(d2, end[2], strides[2]); d2 += strides[2])
            {
                auto row = plane1 + (size_t)d2 * in_shape[3];
                for (int32_t d3 = begin[3]; keep_going(d3, end[3], strides[3]); d3 += strides[3])
                    *output++ = row[d3];
            }
        }
    }
}
// Interprets an NNIL (nncase intermediate language) byte-code program once
// per element: for each input[i] the small stack machine runs until
// nnil_ret, and the popped result is stored to output[i]. Used to evaluate
// compiler-generated unary activation functions.
// Throws std::runtime_error on an unknown opcode.
inline void nnil_unary_method(const float *input, float *output, size_t count, gsl::span<const gsl::byte> body)
{
    using namespace nncase::runtime;

    for (size_t i = 0; i < count; i++)
    {
        // Fresh stack and a reader positioned at the start of the program
        // for every element.
        nnil_evalstack stack;
        span_reader sr(body);
        nnil_reader reader(sr);
        bool ret = false;

        while (reader.avail() && !ret)
        {
            auto op = reader.next();
            switch (op.opcode)
            {
            // --- stack manipulation ---
            case nnil_nop:
                break;
            case nnil_dup:
                stack.dup();
                break;
            case nnil_pop:
                stack.pop();
                break;
            // --- loads: argument 0 and float constants ---
            case nnil_lda_0:
                stack.push(input[i]);
                break;
            case nnil_ldc_r4_0:
                stack.push(0.f);
                break;
            case nnil_ldc_r4_1:
                stack.push(1.f);
                break;
            case nnil_ldc_r4:
                stack.push(op.ldc_r4.r4);
                break;
            // --- unary math (pop one, push one) ---
            case nnil_abs:
                stack.push(fabsf(stack.pop()));
                break;
            case nnil_ceil:
                stack.push(ceilf(stack.pop()));
                break;
            case nnil_cos:
                stack.push(cosf(stack.pop()));
                break;
            case nnil_exp:
                stack.push(expf(stack.pop()));
                break;
            case nnil_floor:
                stack.push(floorf(stack.pop()));
                break;
            case nnil_log:
                stack.push(logf(stack.pop()));
                break;
            case nnil_neg:
                stack.push(-stack.pop());
                break;
            case nnil_rsqrt:
                stack.push(1.f / sqrtf(stack.pop()));
                break;
            case nnil_sin:
                stack.push(sinf(stack.pop()));
                break;
            case nnil_square:
            {
                auto v = stack.pop();
                stack.push(v * v);
                break;
            }
            // --- binary math: right operand is on top of the stack ---
            case nnil_add:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a + b);
                break;
            }
            case nnil_sub:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a - b);
                break;
            }
            case nnil_mul:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a * b);
                break;
            }
            case nnil_div:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a / b);
                break;
            }
            case nnil_min:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(std::min(a, b));
                break;
            }
            case nnil_max:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(std::max(a, b));
                break;
            }
            // --- clamp: pops high, low, then the value ---
            case nnil_clamp:
            {
                auto high = stack.pop();
                auto low = stack.pop();
                auto v = stack.pop();
                stack.push(std::clamp(v, low, high));
                break;
            }
            // --- return: store the top of stack and stop this element ---
            case nnil_ret:
                output[i] = stack.pop();
                ret = true;
                break;
            default:
                throw std::runtime_error("Invalid nnil op");
            }
        }
    }
}
// 256-entry LUT substitution: each byte of `input` is replaced by
// table[byte].
inline void table_lookup1d(const uint8_t *CXX_RESTRICT input, uint8_t *CXX_RESTRICT output, size_t size, const uint8_t *CXX_RESTRICT table)
{
    const uint8_t *src_end = input + size;
    while (input != src_end)
        *output++ = table[*input++];
}
}

View File

@ -0,0 +1,25 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS
NNCASE_API result<void> nnil_unary_method(const float *input, float *output, size_t count, gsl::span<const gsl::byte> body, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,26 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
BEGIN_NS_NNCASE_KERNELS
NNCASE_API result<void> reduce_window2d(reduce_op_t op, const float *input, float init_value, float *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,83 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "../kernel_utils.h"
#include <cmath>
#include <runtime/runtime_op_utility.h>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace kernels
{
namespace neutral
{
// Dequantize TQ -> float, manually unrolled x2 to keep the in-order K210
// pipeline busy.
// NOTE(review): computes q * (1/param.scale) + (-zero_point/param.scale),
// i.e. real = (q - zero_point) / param.scale; confirm this matches the
// quantizer's scale convention before reuse.
template <class TQ>
void riscv_dequantize(const TQ *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, const quant_param_t &param)
{
    // Precompute multiplier and offset so the loop is a fused multiply-add.
    float scale = 1.f / param.scale;
    float zero = -param.zero_point * scale;

    for (size_t i = 0; i < count / 2; i++)
    {
        // handwritten pipeline for in order CPU
        auto in1_q = input[i * 2];
        auto in2_q = input[i * 2 + 1];
        auto in1 = (float)in1_q;
        auto in2 = (float)in2_q;
        auto out1 = in1 * scale + zero;
        auto out2 = in2 * scale + zero;
        output[i * 2] = out1;
        output[i * 2 + 1] = out2;
    }

    // Tail element when count is odd.
    if (count % 2)
        output[count - 1] = input[count - 1] * scale + zero;
}
// Quantize float -> TQ, unrolled x2; the RISC-V fcvt.w.s instruction is
// issued with the explicit "rne" (round-to-nearest-even) rounding mode so
// results do not depend on the ambient FP environment.
// NOTE(review): the odd tail element uses roundf, which rounds halves away
// from zero - confirm the tail is allowed to round differently from the
// unrolled body.
template <class TQ>
void riscv_quantize(const float *CXX_RESTRICT input, TQ *CXX_RESTRICT output, size_t count, const quant_param_t &param)
{
    float scale = param.scale;
    float zero = param.zero_point;

    for (size_t i = 0; i < count / 2; i++)
    {
        auto in1 = input[i * 2];
        auto in2 = input[i * 2 + 1];
        in1 = in1 * scale + zero;
        in2 = in2 * scale + zero;
        int32_t out1, out2;
        asm volatile("fcvt.w.s %0, %1, rne"
                     : "=r"(out1)
                     : "f"(in1));
        asm volatile("fcvt.w.s %0, %1, rne"
                     : "=r"(out2)
                     : "f"(in2));
        // Saturate to TQ's representable range.
        output[i * 2] = std::clamp(out1, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
        output[i * 2 + 1] = std::clamp(out2, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
    }

    if (count % 2)
    {
        auto in = (int32_t)roundf(input[count - 1] * scale + zero);
        output[count - 1] = std::clamp(in, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
    }
}
}
}
}

View File

@ -0,0 +1,72 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS
NNCASE_API result<void> batch_to_space(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> broadcast(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> concat(datatype_t type, gsl::span<const gsl::byte *const> inputs, gsl::byte *output, const runtime_shape_t &out_shape,
gsl::span<const runtime_shape_t> in_strides, const runtime_shape_t &out_strides, size_t axis, const runtime_shape_t &concat_dims,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> copy(datatype_t type, const gsl::byte *src, gsl::byte *dest,
const runtime_shape_t &shape, const runtime_shape_t &src_strides, const runtime_shape_t &dest_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> transpose(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &perm, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> binary(binary_op_t op, const float *input_a, const float *input_b, float *output,
const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
const runtime_shape_t &in_b_strides, const runtime_shape_t &out_strides, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> dequantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> lut1d(datatype_t type, const gsl::byte *input, const gsl::byte *table, gsl::byte *output, const runtime_shape_t &shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const scalar &min, const scalar &max) noexcept;
NNCASE_API result<void> pad(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_paddings_t &paddings, pad_mode_t mode, const scalar &pad_value,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> quantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> reduce(reduce_op_t op, float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &begins, const runtime_shape_t &ends, const runtime_axis_t &strides,
kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,34 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include <nncase/runtime/compiler_defs.h>
BEGIN_NS_NNCASE_RUNTIME
// Opaque base for allocator bookkeeping passed to host_allocator::allocate.
// The virtual destructor enables polymorphic destruction of concrete
// runtime-defined states.
class NNCASE_API allocation_state
{
public:
    virtual ~allocation_state();
};
// Abstract host-memory allocator interface used by the runtime.
class NNCASE_API host_allocator
{
public:
    virtual ~host_allocator();

    // Allocates `bytes` bytes associated with `state`; returns the writable
    // span. NOTE(review): ownership/failure behavior is defined by the
    // concrete implementation - not visible here.
    virtual gsl::span<gsl::byte> allocate(allocation_state &state, size_t bytes) = 0;
};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,353 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cmath>
#include <cstdint>
#include <float.h>
#include <functional>
#include <limits>
#include <nncase/runtime/compiler_defs.h>
namespace nncase
{
// Raw carrier for a 16-bit half-precision value; only the bit pattern is
// stored here, no arithmetic is defined in this header.
struct half
{
    uint16_t value;
};

// Tag type used to disambiguate "construct from raw bit pattern"
// constructor overloads (see bfloat16(from_raw_t, uint16_t)).
struct from_raw_t
{
    explicit from_raw_t() = default;
};

NNCASE_INLINE_VAR constexpr from_raw_t from_raw {};
// bfloat16 ("brain float"): 1 sign, 8 exponent, 7 mantissa bits - i.e. the
// high 16 bits of an IEEE-754 binary32. Stores the raw 16-bit pattern and
// converts to/from float on demand.
struct bfloat16
{
private:
    // Type-punning helper: view a float as its u32 bit pattern and select
    // the 16 bits holding sign/exponent/high-mantissa (endian-dependent).
    union fp32
    {
        uint32_t u32;
        float f32;

        uint16_t u16() const noexcept
        {
            constexpr size_t index = NNCASE_LITTLE_ENDIAN ? 1 : 0;
            return reinterpret_cast<const uint16_t *>(&u32)[index];
        }

        uint16_t &u16() noexcept
        {
            constexpr size_t index = NNCASE_LITTLE_ENDIAN ? 1 : 0;
            return reinterpret_cast<uint16_t *>(&u32)[index];
        }
    };

    // A value that represents "zero".
    static constexpr uint16_t ZERO_VALUE = 0;

    // A value that represents "not a number" (canonical quiet NaN pattern).
    static constexpr uint16_t NAN_VALUE = 0x7FC0;

public:
    bfloat16() noexcept = default;

    // Converting constructor truncates (drops the low mantissa bits, no
    // rounding) - see truncate_to_bfloat16.
    explicit bfloat16(float v) noexcept
        : value_(truncate_to_bfloat16(v).value_) { }

    template <class T, class = std::enable_if_t<std::is_integral<T>::value || std::is_floating_point<T>::value>>
    explicit bfloat16(const T &val) noexcept
        : bfloat16(static_cast<float>(val)) { }

    // Constructs directly from a raw 16-bit pattern (no conversion).
    constexpr bfloat16(from_raw_t, uint16_t value) noexcept
        : value_(value) { }

    // Widening to float is exact: the stored bits become the high half of a
    // binary32 whose low mantissa bits are zero.
    operator float() const noexcept
    {
        fp32 result;
        result.u32 = 0;
        result.u16() = value_;
        return result.f32;
    }

    const uint16_t &raw() const noexcept { return value_; }
    uint16_t &raw() noexcept { return value_; }

    static constexpr bfloat16 from_raw(uint16_t v) noexcept
    {
        return bfloat16(nncase::from_raw, v);
    }

    // Converts by taking the high 16 bits of the binary32 pattern (no
    // rounding); NaNs are squashed to the canonical quiet NaN.
    static bfloat16 truncate_to_bfloat16(const float v) noexcept
    {
        bfloat16 output;

        if (!std::isnan(v))
        {
            fp32 f;
            f.f32 = v;
            output.value_ = f.u16();
        }
        else
        {
            output.value_ = NAN_VALUE;
        }

        return output;
    }

    // Converts a float point to bfloat16, with round-nearest-to-even as rounding
    // method.
    static bfloat16 round_to_bfloat16(float v)
    {
        uint32_t input;
        fp32 f;
        f.f32 = v;
        input = f.u32;
        bfloat16 output;

        if (!std::isnan(v))
        {
            // Least significant bit of resulting bfloat.
            uint32_t lsb = (input >> 16) & 1;
            uint32_t rounding_bias = 0x7fff + lsb;
            input += rounding_bias;
            output.value_ = static_cast<uint16_t>(input >> 16);
        }
        else
        {
            // If the value is a NaN, squash it to a qNaN with msb of fraction set,
            // this makes sure after truncation we don't end up with an inf.
            //
            // qNaN magic: All exponent bits set + most significant bit of fraction
            // set.
            output.value_ = NAN_VALUE;
        }

        return output;
    }

    static constexpr bfloat16 epsilon() noexcept
    {
        // 0x1.0p-7
        return from_raw(0x3c00);
    }

    static constexpr bfloat16 highest() noexcept
    {
        // 0x1.FEp127
        return from_raw(0x7F7F);
    }

    static constexpr bfloat16 min() noexcept
    {
        // 0x1p-126 (smallest positive normal value)
        return from_raw(0x0080);
    }

    static constexpr bfloat16 lowest() noexcept
    {
        // -0x1.FEp127
        return from_raw(0xFF7F);
    }

    static constexpr bfloat16 nan() noexcept
    {
        return from_raw(NAN_VALUE);
    }

    static constexpr bfloat16 infinity() noexcept
    {
        return from_raw(0x7f80);
    }

    // True for both +0 and -0 (sign bit is masked off).
    constexpr bool zero() const noexcept { return (value_ & 0x7FFF) == ZERO_VALUE; }

private:
    uint16_t value_; // raw bfloat16 bit pattern
};
// Arithmetic on bfloat16 widens both operands to float, computes in float,
// then rounds the result back (round-to-nearest-even).
#define DEFINE_BF16_BINARY_BF16RET(x)                            \
    inline bfloat16 operator x(bfloat16 a, bfloat16 b) noexcept  \
    {                                                            \
        return bfloat16::round_to_bfloat16(float(a) x float(b)); \
    }

// Comparisons are computed in float; widening is exact, so ordering matches
// the underlying real values.
#define DEFINE_BF16_BINARY_BOOLRET(x)                       \
    inline bool operator x(bfloat16 a, bfloat16 b) noexcept \
    {                                                       \
        return float(a) x float(b);                         \
    }

DEFINE_BF16_BINARY_BF16RET(+)
DEFINE_BF16_BINARY_BF16RET(-)
DEFINE_BF16_BINARY_BF16RET(*)
DEFINE_BF16_BINARY_BF16RET(/)
DEFINE_BF16_BINARY_BOOLRET(<)
DEFINE_BF16_BINARY_BOOLRET(<=)
DEFINE_BF16_BINARY_BOOLRET(>=)
DEFINE_BF16_BINARY_BOOLRET(>)

// Compound assignments defined in terms of the binary operators above.
#define DEFINE_BF16_BINARY_SELF_MOD(x, op)                    \
    inline bfloat16 &operator x(bfloat16 &a, bfloat16 b) noexcept \
    {                                                         \
        a = a op b;                                           \
        return a;                                             \
    }

DEFINE_BF16_BINARY_SELF_MOD(+=, +)
DEFINE_BF16_BINARY_SELF_MOD(-=, -)
DEFINE_BF16_BINARY_SELF_MOD(*=, *)
DEFINE_BF16_BINARY_SELF_MOD(/=, /)

inline bfloat16 operator-(bfloat16 a) noexcept
{
    return bfloat16::round_to_bfloat16(-float(a));
}

// NOTE(review): equality is bitwise, so NaN == NaN is true and +0 != -0 -
// this differs from IEEE float semantics; confirm callers expect bitwise
// identity.
inline bool operator==(const bfloat16 &lhs, const bfloat16 &rhs) noexcept
{
    return lhs.raw() == rhs.raw();
}

inline bool operator!=(const bfloat16 &lhs, const bfloat16 &rhs) noexcept
{
    return lhs.raw() != rhs.raw();
}
}
namespace std
{
// std::hash support: hashes the exactly-widened float value, so bitwise-equal
// bfloat16 values always hash alike.
template <>
struct hash<nncase::bfloat16>
{
    size_t operator()(const nncase::bfloat16 &v) const
    {
        return hash<float>()(static_cast<float>(v));
    }
};
// std::numeric_limits specialization so generic numeric code can query
// bfloat16 properties (range, precision, special values).
template <>
struct numeric_limits<nncase::bfloat16>
{
    static constexpr float_denorm_style has_denorm = denorm_present;
    static constexpr bool has_infinity = true;
    static constexpr bool has_quiet_NaN = true;
    static constexpr bool has_signaling_NaN = true;
    static constexpr bool is_bounded = true;
    static constexpr bool is_iec559 = true;
    static constexpr bool is_signed = true;
    static constexpr bool is_specialized = true;
    static constexpr float_round_style round_style = round_to_nearest;
    static constexpr int radix = FLT_RADIX;

    [[nodiscard]] static constexpr nncase::bfloat16(min)() noexcept
    {
        return nncase::bfloat16::min();
    }

    [[nodiscard]] static constexpr nncase::bfloat16(max)() noexcept
    {
        return nncase::bfloat16::highest();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 lowest() noexcept
    {
        return nncase::bfloat16::lowest();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 epsilon() noexcept
    {
        return nncase::bfloat16::epsilon();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 round_error() noexcept
    {
        // 0.5
        return nncase::bfloat16::from_raw(0x3f00);
    }

    [[nodiscard]] static constexpr nncase::bfloat16 denorm_min() noexcept
    {
        // NOTE(review): returns the smallest *normal* value; a true bfloat16
        // subnormal minimum would be smaller - confirm this is intentional.
        return nncase::bfloat16::min();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 infinity() noexcept
    {
        return nncase::bfloat16::infinity();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 quiet_NaN() noexcept
    {
        return nncase::bfloat16::nan();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 signaling_NaN() noexcept
    {
        // NOTE(review): same payload as quiet_NaN() - no distinct signaling
        // pattern is defined here.
        return nncase::bfloat16::nan();
    }

    static constexpr int digits = 8; // 7 stored mantissa bits + implicit 1
    static constexpr int max_exponent = FLT_MAX_EXP;
    static constexpr int min_exponent = FLT_MIN_EXP;
};
using nncase::bfloat16;

// <cmath>-style overloads: classify on the exactly-widened float, or
// evaluate in float precision and round the result back to bfloat16
// (round-to-nearest-even).
inline bool isinf(const bfloat16 &a) { return std::isinf(float(a)); }
inline bool isnan(const bfloat16 &a) { return std::isnan(float(a)); }
inline bool isfinite(const bfloat16 &a) { return std::isfinite(float(a)); }

inline bfloat16 abs(const bfloat16 &a) { return bfloat16::round_to_bfloat16(fabsf(float(a))); }
inline bfloat16 exp(const bfloat16 &a) { return bfloat16::round_to_bfloat16(expf(float(a))); }
inline bfloat16 log(const bfloat16 &a) { return bfloat16::round_to_bfloat16(logf(float(a))); }

inline bfloat16 log10(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(log10f(float(a)));
}

inline bfloat16 sqrt(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(sqrtf(float(a)));
}

inline bfloat16 pow(const bfloat16 &a, const bfloat16 &b)
{
    return bfloat16::round_to_bfloat16(powf(float(a), float(b)));
}

inline bfloat16 sin(const bfloat16 &a) { return bfloat16::round_to_bfloat16(sinf(float(a))); }
inline bfloat16 cos(const bfloat16 &a) { return bfloat16::round_to_bfloat16(cosf(float(a))); }
inline bfloat16 tan(const bfloat16 &a) { return bfloat16::round_to_bfloat16(tanf(float(a))); }

inline bfloat16 tanh(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(tanhf(float(a)));
}

inline bfloat16 floor(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(floorf(float(a)));
}

inline bfloat16 ceil(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(ceilf(float(a)));
}

inline bfloat16 round(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(roundf(float(a)));
}

inline bfloat16 nearbyint(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(nearbyintf(float(a)));
}

inline long lrint(const bfloat16 &a)
{
    return lrintf(float(a));
}
} // namespace std

View File

@ -0,0 +1,167 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "datatypes.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <span>
namespace nncase::runtime
{
// Little-endian bit-level reader over a byte span. Bits are consumed
// LSB-first from each byte through a 64-bit staging buffer.
class bitreader
{
public:
    // Wraps `data`; no copy is made, the span must outlive the reader.
    bitreader(std::span<const uint8_t> data)
        : data_(data), buffer_(0), avail_(0) { }

    // Reads `bits` bits into `dest`, 8 bits per destination byte (the last
    // byte receives the remainder when `bits` is not a multiple of 8).
    void read(uint8_t *dest, size_t bits)
    {
        while (bits)
        {
            auto to_read = std::min(bits, size_t(8));
            *dest++ = read_bits_le8(to_read);
            bits -= to_read;
        }
    }

    // Reads `Bits` bits into a zero-initialized T and returns it.
    template <class T, size_t Bits>
    T read()
    {
        T ret {};
        read(reinterpret_cast<uint8_t *>(&ret), Bits);
        return ret;
    }

private:
    // Returns the next `bits` (<= 8) bits as the low bits of a byte.
    uint8_t read_bits_le8(size_t bits)
    {
        assert(bits <= 8);
        fill_buffer_le8(bits);
        uint8_t ret = buffer_ & ((size_t(1) << bits) - 1);
        buffer_ >>= bits;
        avail_ -= bits;
        return ret;
    }

    // Tops up the staging buffer so at least `bits` bits are available.
    // NOTE(review): assumes the span still holds data whenever more bits are
    // requested (max_read_bytes != 0 is only asserted, not handled).
    void fill_buffer_le8(size_t bits)
    {
        if (avail_ < bits)
        {
            auto max_read_bytes = std::min(data_.size() * 8, sizeof(buffer_) * 8 - avail_) / 8;
            assert(max_read_bytes != 0);

            uint64_t tmp = 0;
            std::memcpy(&tmp, data_.data(), max_read_bytes);
            data_ = data_.subspan(max_read_bytes);
            // Splice the new bytes above the bits already buffered.
            buffer_ = buffer_ | (tmp << avail_);
            avail_ += max_read_bytes * 8;
        }
    }

private:
    std::span<const uint8_t> data_; // unread bytes
    uint64_t buffer_;               // staged bits, LSB = next bit out
    size_t avail_;                  // number of valid bits in buffer_
};
// Little-endian bit-level writer over a byte span; the mirror of bitreader.
// Bits accumulate LSB-first in a 64-bit staging buffer and are flushed to
// the span in whole bytes.
class bitwriter
{
public:
    // Wraps `data`, optionally resuming `bitoffset` bits into the stream:
    // the already-written low bits of the current byte are preloaded into
    // the buffer so they are preserved when flushed again.
    bitwriter(std::span<uint8_t> data, size_t bitoffset = 0)
        : data_(data), buffer_(0), avail_(sizeof(buffer_) * 8)
    {
        if (bitoffset)
        {
            data_ = data_.subspan(bitoffset / 8);
            bitoffset %= 8;
            buffer_ = data_.front() & ((size_t(1) << bitoffset) - 1);
            avail_ -= bitoffset;
        }
    }

    // Any buffered bits are written out (zero-padded to a whole byte) on
    // destruction.
    ~bitwriter() { flush(); }

    // Writes `bits` bits taken from `src`, 8 bits per source byte.
    void write(const uint8_t *src, size_t bits)
    {
        while (bits)
        {
            auto to_write = std::min(bits, size_t(8));
            write_bits_le8(*src++, to_write);
            bits -= to_write;
        }
    }

    // Writes the low `Bits` bits of `value`.
    template <size_t Bits, class T>
    void write(T value)
    {
        write(reinterpret_cast<const uint8_t *>(&value), Bits);
    }

    // Writes all buffered bits, rounding the final partial byte up.
    void flush()
    {
        auto write_bytes = (buffer_written_bits() + 7) / 8;
        if (write_bytes)
        {
            assert(data_.size() >= write_bytes);
            std::memcpy(data_.data(), &buffer_, write_bytes);
            data_ = data_.subspan(write_bytes);
            buffer_ = 0;
            avail_ = sizeof(buffer_) * 8;
        }
    }

private:
    // Appends the low `bits` (<= 8) bits of `value` to the staging buffer.
    void write_bits_le8(uint8_t value, size_t bits)
    {
        assert(bits <= 8);
        reserve_buffer_8();
        size_t new_value = value & ((size_t(1) << bits) - 1);
        buffer_ = buffer_ | (new_value << buffer_written_bits());
        avail_ -= bits;
    }

    // Guarantees at least 8 bits of free buffer space by flushing the whole
    // bytes currently staged.
    void reserve_buffer_8()
    {
        if (avail_ < 8)
        {
            auto write_bytes = buffer_written_bits() / 8;
            assert(data_.size() >= write_bytes);
            std::memcpy(data_.data(), &buffer_, write_bytes);
            data_ = data_.subspan(write_bytes);

            // Shift out the flushed bytes, keeping any partial-byte residue.
            if (write_bytes == sizeof(buffer_))
                buffer_ = 0;
            else
                buffer_ >>= write_bytes * 8;
            avail_ += write_bytes * 8;
        }
    }

    size_t buffer_written_bits() const noexcept
    {
        return sizeof(buffer_) * 8 - avail_;
    }

private:
    std::span<uint8_t> data_; // remaining writable bytes
    uint64_t buffer_;         // pending bits, LSB written first
    size_t avail_;            // free bit capacity left in buffer_
};
}

View File

@ -0,0 +1,107 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <gsl/gsl-lite.hpp>
#include <type_traits>

// NNCASE_API controls symbol visibility: dllexport when building the nncase
// DLL, dllimport when consuming it as a shared library, empty otherwise
// (including all non-MSVC toolchains).
#if defined(_MSC_VER)
#ifdef NNCASE_DLL
#define NNCASE_API __declspec(dllexport)
#elif defined(NNCASE_SHARED_LIBS)
#define NNCASE_API __declspec(dllimport)
#else
#define NNCASE_API
#endif
#else
#define NNCASE_API
#endif

// Marks a code path the author asserts can never be reached; reaching it at
// runtime is undefined behavior.
#if defined(_MSC_VER)
#define NNCASE_UNREACHABLE() __assume(0)
#else
#define NNCASE_UNREACHABLE() __builtin_unreachable()
#endif

// C++17 and later: use standard inline variables, [[maybe_unused]] and
// std::invoke_result. Pre-C++17: approximate them with the legacy spellings.
#if gsl_CPP17_OR_GREATER
#define NNCASE_INLINE_VAR inline
#define NNCASE_UNUSED [[maybe_unused]]
namespace nncase
{
template <class Callable, class... Args>
using invoke_result_t = std::invoke_result_t<Callable, Args...>;
}
#else
#define NNCASE_INLINE_VAR
#if defined(_MSC_VER)
#define NNCASE_UNUSED
#else
#define NNCASE_UNUSED __attribute__((unused))
#endif
namespace nncase
{
// std::result_of_t is the pre-C++17 spelling of invoke_result.
template <class Callable, class... Args>
using invoke_result_t = std::result_of_t<Callable(Args...)>;
}
#endif

// NOTE(review): endianness is hard-coded; confirm big-endian targets are out
// of scope before porting.
#define NNCASE_LITTLE_ENDIAN 1

#define NNCASE_HAVE_STD_BYTE gsl_CPP17_OR_GREATER
#define NNCASE_NODISCARD gsl_NODISCARD
#define NNCASE_NORETURN gsl_NORETURN

// Namespace open/close helpers so nested namespaces stay consistent across
// the runtime headers.
#define BEGIN_NS_NNCASE_RUNTIME \
namespace nncase \
{ \
namespace runtime \
{
#define END_NS_NNCASE_RUNTIME \
} \
}
#define BEGIN_NS_NNCASE_RT_STACKVM \
namespace nncase \
{ \
namespace runtime \
{ \
namespace stackvm \
{
#define END_NS_NNCASE_RT_STACKVM \
} \
} \
}
#define BEGIN_NS_NNCASE_KERNELS \
namespace nncase \
{ \
namespace kernels \
{
#define END_NS_NNCASE_KERNELS \
} \
}

// Delegates bitmask operator generation to gsl-lite unless already defined.
#ifndef DEFINE_ENUM_BITMASK_OPERATORS
#define DEFINE_ENUM_BITMASK_OPERATORS(ENUMTYPE) gsl_DEFINE_ENUM_BITMASK_OPERATORS(ENUMTYPE)
#endif

namespace nncase
{
// Tag type selecting "leave memory uninitialized" constructor overloads.
struct default_init_t
{
};

NNCASE_INLINE_VAR constexpr default_init_t default_init {};
}

View File

@ -0,0 +1,12 @@
// X-macro table of nncase element types.
// Columns: DEFINE_DATATYPE(id, C++ type, short name, wire value).
// Define DEFINE_DATATYPE before including this file, then #undef it.
// The hex values are part of the serialized model format - do not renumber.
DEFINE_DATATYPE(int8, int8_t, i8, 0x00)
DEFINE_DATATYPE(int16, int16_t, i16, 0x01)
DEFINE_DATATYPE(int32, int32_t, i32, 0x02)
DEFINE_DATATYPE(int64, int64_t, i64, 0x03)
DEFINE_DATATYPE(uint8, uint8_t, u8, 0x04)
DEFINE_DATATYPE(uint16, uint16_t, u16, 0x05)
DEFINE_DATATYPE(uint32, uint32_t, u32, 0x06)
DEFINE_DATATYPE(uint64, uint64_t, u64, 0x07)
DEFINE_DATATYPE(float16, half, f16, 0x08)
DEFINE_DATATYPE(float32, float, f32, 0x09)
DEFINE_DATATYPE(float64, double, f64, 0x0A)
DEFINE_DATATYPE(bfloat16, bfloat16, bf16, 0x0B)

View File

@ -0,0 +1,436 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "bfloat16.h"
#include "compiler_defs.h"
#include "small_vector.hpp"
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
namespace nncase
{
// Element type tags; numeric values come from datatypes.def and are part of
// the serialized model format.
typedef enum _datatype : uint8_t
{
#define DEFINE_DATATYPE(id, t, name, value) dt_##id = value,
#include "datatypes.def"
#undef DEFINE_DATATYPE
} datatype_t;

namespace detail
{
// Maps a datatype_t tag to its C++ type (specializations generated below).
template <datatype_t Type>
struct datatype_to_cpp_type
{
};

// Maps a C++ type back to its datatype_t tag (specializations below).
template <class T>
struct cpp_type_to_datatype
{
};

#if NNCASE_HAVE_STD_BYTE
// std::byte is tagged as uint8 for storage purposes.
template <>
struct cpp_type_to_datatype<std::byte>
{
    static constexpr datatype_t type = dt_uint8;
};
#endif

#define DEFINE_DATATYPE(id, t, name, value) \
template <> \
struct datatype_to_cpp_type<dt_##id> \
{ \
    using type = t; \
}; \
template <> \
struct cpp_type_to_datatype<t> \
{ \
    static constexpr datatype_t type = dt_##id; \
};
#include "datatypes.def"
#undef DEFINE_DATATYPE

// Size in bytes of one element of `type`.
// For an unknown tag the -1 wraps to SIZE_MAX (the return type is unsigned);
// callers should treat that as an error sentinel.
inline constexpr size_t datatype_bytes(datatype_t type)
{
    switch (type)
    {
#define DEFINE_DATATYPE(id, t, name, value) \
    case (dt_##id): \
        return sizeof(t);
#include "datatypes.def"
#undef DEFINE_DATATYPE
    default:
        return -1;
    }
}
}

// Compile-time C++ type -> datatype_t tag.
template <class T>
constexpr datatype_t to_datatype() noexcept
{
    return detail::cpp_type_to_datatype<T>::type;
}

// Compile-time datatype_t tag -> C++ type.
template <datatype_t Type>
using to_cpp_type_t = typename detail::datatype_to_cpp_type<Type>::type;
// Per-dimension padding amounts (values may be negative to express a crop).
struct padding
{
    int32_t before;
    int32_t after;

    // Total padding applied to the dimension.
    int32_t sum() const noexcept { return before + after; }
    static padding zero() noexcept { return {}; }
};

// Closed interval [min, max] of values of T.
template <class T>
struct value_range
{
    T min;
    T max;

    // Whole representable range: +-infinity for floating types (including
    // bfloat16), [lowest, max] for integral types.
    static constexpr value_range<T> full() noexcept
    {
        if (std::is_floating_point<T>::value || std::is_same<T, bfloat16>::value)
            return { -std::numeric_limits<T>::infinity(), std::numeric_limits<T>::infinity() };
        else
            return { std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max() };
    }

    static constexpr value_range<T> nonnegative() noexcept
    {
        return { 0, std::numeric_limits<T>::max() };
    }

    constexpr T length() const noexcept { return max - min; }
};

// Reduction kinds supported by the reduce kernels.
typedef enum _reduce_op
{
    reduce_mean,
    reduce_min,
    reduce_max,
    reduce_sum
} reduce_op_t;
// Element-wise binary operator kinds.
typedef enum _binary_op
{
    binary_add,
    binary_sub,
    binary_mul,
    binary_div,
    binary_min,
    binary_max,
    binary_pow,
    binary_floor_div,
    binary_floor_mod,
    binary_bitwise_and,
    binary_bitwise_or,
    binary_bitwise_xor,
    binary_logical_and,
    binary_logical_or,
    binary_logical_xor
} binary_op_t;

// Human-readable name of a binary operator; "unknown" for any tag outside
// the enumerator range.
inline std::string binary_op_to_string(binary_op_t op)
{
    // Table order mirrors the enumerator order above (values 0..14).
    static const char *const names[] = {
        "binary_add",
        "binary_sub",
        "binary_mul",
        "binary_div",
        "binary_min",
        "binary_max",
        "binary_pow",
        "binary_floor_div",
        "binary_floor_mod",
        "binary_bitwise_and",
        "binary_bitwise_or",
        "binary_bitwise_xor",
        "binary_logical_and",
        "binary_logical_or",
        "binary_logical_xor",
    };
    const auto index = static_cast<size_t>(op);
    if (index < sizeof(names) / sizeof(names[0]))
        return names[index];
    return "unknown";
}
// Element-wise unary operator kinds.
typedef enum _unary_op
{
    unary_abs,
    unary_ceil,
    unary_cos,
    unary_exp,
    unary_floor,
    unary_log,
    unary_neg,
    unary_round,
    unary_rsqrt,
    unary_sin,
    unary_sqrt,
    unary_square,
    unary_tanh,
    unary_bitwise_not,
    unary_logical_not
} unary_op_t;

// Human-readable name of a unary operator; "unknown" for any tag outside
// the enumerator range.
inline std::string unary_op_to_string(unary_op_t op)
{
    // Table order mirrors the enumerator order above (values 0..14).
    static const char *const names[] = {
        "unary_abs",
        "unary_ceil",
        "unary_cos",
        "unary_exp",
        "unary_floor",
        "unary_log",
        "unary_neg",
        "unary_round",
        "unary_rsqrt",
        "unary_sin",
        "unary_sqrt",
        "unary_square",
        "unary_tanh",
        "unary_bitwise_not",
        "unary_logical_not",
    };
    const auto index = static_cast<size_t>(op);
    if (index < sizeof(names) / sizeof(names[0]))
        return names[index];
    return "unknown";
}
// Interpolation modes for the image-resize kernels.
typedef enum _image_resize_mode
{
    image_resize_bilinear,
    image_resize_nearest_neighbor
} image_resize_mode_t;

// Fill strategies for the pad kernel.
typedef enum _pad_mode
{
    pad_constant,
    pad_reflect,
    pad_symmetric,
    pad_edge
} pad_mode_t;

// Affine quantization parameters: real = (quantized - zero_point) * scale.
typedef struct _quant_param
{
    int32_t zero_point;
    float scale;

    // Real-valued range representable by quantized type T with these params.
    template <class T>
    constexpr value_range<float> range() const noexcept
    {
        return {
            (std::numeric_limits<T>::lowest() - zero_point) * scale, (std::numeric_limits<T>::max() - zero_point) * scale
        };
    }
} quant_param_t;

inline bool operator==(const quant_param_t &lhs, const quant_param_t &rhs) noexcept
{
    return lhs.zero_point == rhs.zero_point && lhs.scale == rhs.scale;
}

// Equality allowing an absolute epsilon difference on scale.
// NOTE(review): a fixed machine epsilon is only meaningful for scales near
// 1.0 - confirm callers never compare large-magnitude scales this way.
inline bool almost_equal(const quant_param_t &lhs, const quant_param_t &rhs) noexcept
{
    return lhs.zero_point == rhs.zero_point
        && fabs(lhs.scale - rhs.scale) <= std::numeric_limits<float>::epsilon();
}

// Fixed multiplier pair; rounded_mul() rounds mul to the nearest int32.
// NOTE(review): presumably represents value ~ mul * 2^shift - confirm with
// the quantizer that produces it.
struct fixed_mul
{
    float mul;
    int8_t shift;

    int32_t rounded_mul() const noexcept { return (int32_t)lrintf(mul); }
};

// Memory pool identifiers used by memory_range::memory_location.
using memory_location_t = uint8_t;
NNCASE_INLINE_VAR constexpr memory_location_t mem_input = 0;
NNCASE_INLINE_VAR constexpr memory_location_t mem_output = 1;
NNCASE_INLINE_VAR constexpr memory_location_t mem_rdata = 2;
NNCASE_INLINE_VAR constexpr memory_location_t mem_data = 3;

// Small-vector aliases sized for the common <= 4-D tensor case.
using runtime_shape_t = itlib::small_vector<size_t, 4>;
using runtime_axis_t = itlib::small_vector<int32_t, 4>;
using runtime_paddings_t = itlib::small_vector<padding, 4>;
// Tagged scalar value: a datatype_t tag plus 8 bytes of aligned storage
// reinterpreted through as<T>().
struct scalar
{
    datatype_t type;
    std::aligned_storage_t<8> storage;

    // Default construction leaves type and storage uninitialized.
    scalar() = default;

    scalar(int8_t value) noexcept
    {
        type = dt_int8;
        as<int8_t>() = value;
    }

    scalar(int16_t value) noexcept
    {
        type = dt_int16;
        as<int16_t>() = value;
    }

    scalar(int32_t value) noexcept
    {
        type = dt_int32;
        as<int32_t>() = value;
    }

    scalar(uint8_t value) noexcept
    {
        type = dt_uint8;
        as<uint8_t>() = value;
    }

    scalar(uint16_t value) noexcept
    {
        type = dt_uint16;
        as<uint16_t>() = value;
    }

    scalar(uint32_t value) noexcept
    {
        type = dt_uint32;
        as<uint32_t>() = value;
    }

    scalar(bfloat16 value) noexcept
    {
        type = dt_bfloat16;
        as<bfloat16>() = value;
    }

    scalar(float value) noexcept
    {
        type = dt_float32;
        as<float>() = value;
    }

    // Reinterpret the storage as T; the caller is responsible for matching
    // T against `type`.
    template <class T>
    T &as() noexcept { return *reinterpret_cast<T *>(&storage); }

    template <class T>
    const T &as() const noexcept { return *reinterpret_cast<const T *>(&storage); }
};

// A typed sub-range of one of the memory pools (see memory_location_t).
struct memory_range
{
    memory_location_t memory_location;
    datatype_t datatype;
    uint16_t reserved0;
    uint32_t start;
    uint32_t size;
};

NNCASE_INLINE_VAR constexpr size_t MAX_MODULE_TYPE_LENGTH = 16;
// Fixed-size module type name; unused tail slots are value-initialized.
typedef std::array<char, MAX_MODULE_TYPE_LENGTH> module_type_t;

// Builds a module_type_t from a string literal at compile time (the
// terminating NUL is copied along with the characters).
template <std::size_t N, std::size_t... Is>
constexpr module_type_t
to_module_type(const char (&a)[N], std::index_sequence<Is...>)
{
    return { { a[Is]... } };
}

template <std::size_t N>
constexpr module_type_t to_module_type(const char (&a)[N])
{
    return to_module_type(a, std::make_index_sequence<N>());
}
// Component-wise sum of two paddings.
inline padding operator+(const padding &lhs, const padding &rhs) noexcept
{
    padding result { lhs.before + rhs.before, lhs.after + rhs.after };
    return result;
}

// Two paddings are equal when both components match.
inline bool operator==(const padding &lhs, const padding &rhs) noexcept
{
    return (lhs.before == rhs.before) && (lhs.after == rhs.after);
}

inline bool operator!=(const padding &lhs, const padding &rhs) noexcept
{
    return !(lhs == rhs);
}
template <class T>
bool operator==(const value_range<T> &lhs, const value_range<T> &rhs) noexcept
{
    return lhs.min == rhs.min && lhs.max == rhs.max;
}

template <class T>
bool operator!=(const value_range<T> &lhs, const value_range<T> &rhs) noexcept
{
    return lhs.min != rhs.min || lhs.max != rhs.max;
}

// Scalars are equal when the tags match and the first datatype_bytes(type)
// storage bytes match; trailing storage bytes are ignored.
inline bool operator==(const scalar &lhs, const scalar &rhs) noexcept
{
    auto valid_bytes = detail::datatype_bytes(lhs.type);
    return lhs.type == rhs.type && !memcmp(&lhs.storage, &rhs.storage, valid_bytes);
}

inline bool operator!=(const scalar &lhs, const scalar &rhs) noexcept
{
    auto valid_bytes = detail::datatype_bytes(lhs.type);
    return lhs.type != rhs.type || memcmp(&lhs.storage, &rhs.storage, valid_bytes);
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,48 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <system_error>
BEGIN_NS_NNCASE_RUNTIME

// Runtime error codes surfaced through std::error_condition. Values 0x01xx
// are stackvm errors, 0x02xx are nnil errors.
// NOTE(review): "indentifier" is a typo for "identifier", but the enumerator
// is public API - renaming it would break callers.
enum class nncase_errc
{
    invalid_model_indentifier = 0x01,
    invalid_model_checksum = 0x02,
    invalid_model_version = 0x03,
    runtime_not_found = 0x04,
    datatype_mismatch = 0x05,
    shape_mismatch = 0x06,
    invalid_memory_location = 0x07,
    stackvm_illegal_instruction = 0x0100,
    stackvm_illegal_target = 0x0101,
    stackvm_stack_overflow = 0x0102,
    stackvm_stack_underflow = 0x0103,
    nnil_illegal_instruction = 0x0200,
};

NNCASE_API const std::error_category &nncase_category() noexcept;
NNCASE_API std::error_condition make_error_condition(nncase_errc code);

END_NS_NNCASE_RUNTIME

// Opts nncase_errc into implicit conversion to std::error_condition.
namespace std
{
template <>
struct is_error_condition_enum<nncase::runtime::nncase_errc> : true_type
{
};
}

View File

@ -0,0 +1,96 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_tensor_impl.h"
#include "shared_runtime_tensor.h"
BEGIN_NS_NNCASE_RUNTIME

namespace detail
{
// Cache coherency state of a tensor's host-visible memory.
enum class cache_status_t
{
    valid,
    need_invalidate,
    need_write_back
};

// Owning description of the host memory backing a tensor. Move-only: the
// deleter releases the virtual address exactly once.
struct host_memory_block
{
    host_runtime_tensor::memory_pool_t pool;
    uintptr_t virtual_address;
    size_t size_bytes;
    host_runtime_tensor::data_deleter_t deleter;
    cache_status_t cache_status;
    physical_memory_block physical_block;

    host_memory_block() = default;
    host_memory_block(const host_memory_block &) = delete;
    host_memory_block(host_memory_block && other) noexcept;
    host_memory_block &operator=(const host_memory_block &) = delete;
    host_memory_block &operator=(host_memory_block && other) noexcept;

    ~host_memory_block()
    {
        free();
    }

    // Releases the backing memory; safe to call repeatedly because the
    // deleter is moved out and cleared on first use.
    void free()
    {
        if (auto d = std::move(deleter))
            d(reinterpret_cast<gsl::byte *>(virtual_address));
        deleter = {};
    }

    // Whole allocation viewed as a byte span.
    gsl::span<gsl::byte> virtual_buffer() const noexcept
    {
        return { reinterpret_cast<gsl::byte *>(virtual_address), size_bytes };
    }
};

// runtime_tensor backend for tensors living in host memory; implementations
// of the copy/map/sync entry points live in the corresponding .cpp.
class NNCASE_API host_runtime_tensor_impl : public runtime_tensor_impl
{
public:
    host_runtime_tensor_impl(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, host_memory_block memory_block);

    datatype_t datatype() const noexcept override;
    const runtime_shape_t &shape() const noexcept override;
    const runtime_shape_t &strides() const noexcept override;
    runtime_tensor_type &tensor_type() const noexcept override;
    bool can_copy_from_different_type(const runtime_tensor_impl &src) const noexcept override;
    bool can_copy_to_different_type(const runtime_tensor_impl &dest) const noexcept override;

    result<void> copy_to_same_type(runtime_tensor_impl &dest) noexcept override;
    result<void> copy_from_different_type(runtime_tensor_impl &src) noexcept override;
    result<void> copy_to_different_type(runtime_tensor_impl &dest) noexcept override;
    result<void> copy_from_host(runtime_tensor_impl &src) noexcept override;
    result<void> copy_to_host(runtime_tensor_impl &dest) noexcept override;

    result<host_runtime_tensor::mapped_buffer> map(host_runtime_tensor::map_access_t access) noexcept;
    result<void> unmap(host_runtime_tensor::map_access_t access) noexcept;
    result<void> sync(host_runtime_tensor::sync_op_t op, bool force = false) noexcept;

    const host_memory_block &memory_block() const noexcept { return memory_block_; }
    host_memory_block &memory_block() noexcept { return memory_block_; }

private:
    datatype_t datatype_;
    runtime_shape_t shape_;
    runtime_shape_t strides_;
    host_memory_block memory_block_;
};
}

END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,81 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "allocator.h"
#include "model.h"
#include "result.h"
#include "runtime_module.h"
#include <gsl/gsl-lite.hpp>
#include <memory>
#include <unordered_map>
BEGIN_NS_NNCASE_RUNTIME
// Name -> scalar option store for the interpreter.
//
// Fix: the map was keyed by `const char *`, so lookups compared pointer
// identity rather than string contents. A `get("opt")` whose literal lives
// at a different address than the one passed to `set` (e.g. across
// translation units or with a runtime-built string) silently missed the
// entry. Keying by std::string value makes get/set behave as callers expect
// while leaving both public signatures unchanged.
class NNCASE_API options_dict
{
public:
    // Returns the stored value reinterpreted as T, or
    // std::errc::result_out_of_range when `name` was never set.
    // `name` must be a non-null NUL-terminated string.
    template <class T>
    result<T> get(const char *name)
    {
        auto it = values_.find(name);
        if (it != values_.end())
            return ok(it->second.as<T>());
        else
            return err(std::errc::result_out_of_range);
    }

    // Stores `value` under `name`, overwriting any previous entry with the
    // same name (by content, not by pointer).
    template <class T>
    result<void> set(const char *name, T value)
    {
        values_[name] = scalar(value);
        return ok();
    }

private:
    // Keyed by string content; const char * arguments convert implicitly.
    std::unordered_map<std::string, scalar> values_;
};
// Loads a serialized nncase model and executes it. Move-only.
// Typical flow: load_model() -> fill input_tensor(i) -> run() ->
// read output_tensor(i).
class NNCASE_API interpreter
{
public:
    interpreter() noexcept;
    interpreter(interpreter &) = delete;
    interpreter(interpreter &&) = default;

    // Parses `buffer` and instantiates the contained runtime modules.
    // NOTE(review): lifetime of `buffer` after load_model returns is not
    // visible here - confirm whether the interpreter keeps referencing it.
    NNCASE_NODISCARD result<void> load_model(gsl::span<const gsl::byte> buffer) noexcept;

    // Input/output descriptors of the main module; `index` must be in range.
    size_t inputs_size() const noexcept;
    size_t outputs_size() const noexcept;
    const memory_range &input_desc(size_t index) const noexcept;
    const memory_range &output_desc(size_t index) const noexcept;
    const runtime_shape_t &input_shape(size_t index) const noexcept;
    const runtime_shape_t &output_shape(size_t index) const noexcept;

    // Tensor binding: getters create/return the bound tensor, setters bind
    // a caller-provided tensor.
    result<runtime_tensor> input_tensor(size_t index) noexcept;
    result<void> input_tensor(size_t index, runtime_tensor tensor) noexcept;
    result<runtime_tensor> output_tensor(size_t index) noexcept;
    result<void> output_tensor(size_t index, runtime_tensor tensor) noexcept;

    // Executes the main module once.
    result<void> run() noexcept;

    result<runtime_module *> find_module_by_id(size_t index) noexcept;
    options_dict &options() noexcept;

private:
    std::vector<std::unique_ptr<runtime_module>> modules_;
    runtime_module *main_module_;
    options_dict options_;
};

END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,50 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/compiler_defs.h>
// Export macro for the K210 runtime module.
// NOTE(review): unlike NNCASE_API in compiler_defs.h, there is no empty
// static-library branch on MSVC - consumers always get dllimport; confirm
// static linking of this module on MSVC is unsupported on purpose.
#if defined(_MSC_VER)
#ifdef NNCASE_MODULES_K210_DLL
#define NNCASE_MODULES_K210_API __declspec(dllexport)
#else
#define NNCASE_MODULES_K210_API __declspec(dllimport)
#endif
#else
#define NNCASE_MODULES_K210_API
#endif

// Namespace helpers for nncase::runtime::k210 and nncase::kernels::k210.
#define BEGIN_NS_NNCASE_RT_K210 \
namespace nncase \
{ \
namespace runtime \
{ \
namespace k210 \
{
#define END_NS_NNCASE_RT_K210 \
} \
} \
}
#define BEGIN_NS_NNCASE_KERNELS_K210 \
namespace nncase \
{ \
namespace kernels \
{ \
namespace k210 \
{
#define END_NS_NNCASE_KERNELS_K210 \
} \
} \
}

View File

@ -0,0 +1,37 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <nncase/runtime/error.h>
BEGIN_NS_NNCASE_RT_K210

// K210-specific error codes surfaced through std::error_condition.
enum class nncase_k210_errc
{
    k210_illegal_instruction = 0x01
};

NNCASE_MODULES_K210_API const std::error_category &nncase_k210_category() noexcept;
NNCASE_MODULES_K210_API std::error_condition make_error_condition(nncase_k210_errc code);

END_NS_NNCASE_RT_K210

// Opts nncase_k210_errc into implicit conversion to std::error_condition.
namespace std
{
template <>
struct is_error_condition_enum<nncase::runtime::k210::nncase_k210_errc> : true_type
{
};
}

View File

@ -0,0 +1,24 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "../runtime_module.h"
#include <nncase/kernels/kernel_context.h>
// Kernel execution context for the K210 backend (currently adds nothing to
// the base kernel_context).
// NOTE(review): the struct carries both NNCASE_MODULES_K210_API (previous
// line) and NNCASE_API - one of the two export macros is almost certainly
// unintended; confirm which one this header should use.
NNCASE_MODULES_K210_API
struct NNCASE_API k210_kernel_context : public kernels::kernel_context
{
};
// NOTE(review): END_NS_NNCASE_KERNELS_K210 has no visible matching BEGIN
// macro above - verify the opening macro was not lost.
END_NS_NNCASE_KERNELS_K210

View File

@ -0,0 +1,48 @@
/* This file is generated by tools/stackvm_gen/IsaGen at 2021/2/23 16:24:09 +08:00.
*
* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <nncase/runtime/result.h>
#include <nncase/runtime/span_reader.h>
BEGIN_NS_NNCASE_RT_K210

// Decodes a K210 instruction stream and dispatches one virtual visit() per
// op; override the per-op overloads (default: accept and continue).
class NNCASE_MODULES_K210_API op_visitor
{
public:
    op_visitor() noexcept
        : reader_({})
    {
    }

    ~op_visitor() = default;

    // Walks every op in `text`, stopping early on error or interruption.
    result<void> visit(gsl::span<const gsl::byte> text) noexcept;

    virtual result<void> visit(NNCASE_UNUSED const kpu_download_options &op) noexcept { return ok(); }
    virtual result<void> visit(NNCASE_UNUSED const kpu_conv2d_options &op) noexcept { return ok(); }
    virtual result<void> visit(NNCASE_UNUSED const kpu_upload_options &op) noexcept { return ok(); }

protected:
    // Set by subclasses to stop the walk after the current op.
    bool interrupted_;
    span_reader reader_;

private:
    // Decodes and dispatches the next op from reader_.
    result<void> next() noexcept;
};

END_NS_NNCASE_RT_K210

View File

@ -0,0 +1,25 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <nncase/runtime/runtime_module.h>
BEGIN_NS_NNCASE_RT_K210

// Module type tag stored in module headers for the K210 backend.
NNCASE_INLINE_VAR constexpr module_type_t k210_module_type = to_module_type("k210");

// Factory registered with the interpreter for "k210" modules.
NNCASE_MODULES_K210_API result<std::unique_ptr<runtime_module>> create_k210_runtime_module();

END_NS_NNCASE_RT_K210

View File

@ -0,0 +1,187 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
BEGIN_NS_NNCASE_RT_K210
// How one image row is laid out in KPU RAM.
struct kpu_layout
{
    size_t groups;
    size_t row_len;
    size_t row_pitch;
};

// Row layout chosen by the KPU for a given image width: narrow images pack
// several channel groups into one 64-byte line; wide images span multiple
// 64-byte units per row.
inline kpu_layout get_kpu_row_layout(size_t width)
{
    if (width <= 16)
        return { 4, 1, 16 };
    if (width <= 32)
        return { 2, 1, 32 };
    return { 1, (width + 63) / 64, 64 };
}
// Spatial size of a conv filter; 0 for an unrecognized tag.
inline int32_t get_kpu_filter_size(kpu_filter_type_t filter)
{
    switch (filter)
    {
    case kpu_filter_1x1:
        return 1;
    case kpu_filter_3x3:
        return 3;
    default:
        return 0;
    }
}

// Implicit conv padding (same-padding: 1 for 3x3, 0 for 1x1).
// Any other tag is undefined behavior via NNCASE_UNREACHABLE.
inline int32_t get_kpu_padding(kpu_filter_type_t filter)
{
    switch (filter)
    {
    case kpu_filter_1x1:
        return 0;
    case kpu_filter_3x3:
        return 1;
    default:
        NNCASE_UNREACHABLE();
    }
}

// Pooling padding as { before, after }; `size` is currently unused.
// Only kpu_pool_*_2_s1 pads (after = 1); unknown tags are undefined
// behavior via NNCASE_UNREACHABLE.
inline std::array<int32_t, 2> get_kpu_padding(kpu_pool_type_t filter, NNCASE_UNUSED int32_t size)
{
    switch (filter)
    {
    case kpu_pool_bypass:
        return { 0, 0 };
    case kpu_pool_max_2_s2:
    case kpu_pool_mean_2_s2:
    case kpu_pool_left_top_2_s2:
    case kpu_pool_right_top_2_s2:
        return { 0, 0 };
    case kpu_pool_max_4_s4:
    case kpu_pool_mean_4_s4:
    case kpu_pool_left_top_4_s4:
        return { 0, 0 };
    case kpu_pool_mean_2_s1:
    case kpu_pool_max_2_s1:
        return { 0, 1 };
    default:
        NNCASE_UNREACHABLE();
    }
}
// Number of 64-byte KPU RAM rows needed for a feature map of the given
// width/height/channels, honoring the per-width channel grouping.
inline size_t get_kpu_rows(size_t width, size_t height, size_t channels)
{
    const auto layout = get_kpu_row_layout(width);
    const auto channels_per_block = std::min(channels, layout.groups);
    // Ceiling division: channel groups that do not fill a line still take one.
    const auto channel_blocks = (channels + channels_per_block - 1) / channels_per_block;
    return layout.row_len * height * channel_blocks;
}

// Total KPU RAM bytes for a feature map (rows are 64 bytes each).
inline size_t get_kpu_bytes(size_t width, size_t height, size_t channels)
{
    constexpr size_t row_bytes = 64;
    return get_kpu_rows(width, height, channels) * row_bytes;
}
// Batched variant: bytes for shape [N, C, H, W].
// NOTE(review): returns int32_t while the scalar overload returns size_t -
// large shapes narrow silently; confirm the compiler bounds shapes first.
template <class TShape>
int32_t get_kpu_bytes(const TShape &shape)
{
    return get_kpu_bytes(shape[3], shape[2], shape[1]) * shape[0];
}

// Pooling window size; unknown tags are undefined behavior via
// NNCASE_UNREACHABLE.
inline int32_t get_kpu_filter_size(kpu_pool_type_t filter)
{
    switch (filter)
    {
    case kpu_pool_bypass:
        return 1;
    case kpu_pool_max_2_s2:
    case kpu_pool_mean_2_s2:
    case kpu_pool_left_top_2_s2:
    case kpu_pool_right_top_2_s2:
    case kpu_pool_max_2_s1:
    case kpu_pool_mean_2_s1:
        return 2;
    case kpu_pool_max_4_s4:
    case kpu_pool_mean_4_s4:
    case kpu_pool_left_top_4_s4:
        return 4;
    default:
        NNCASE_UNREACHABLE();
    }
}

// Pooling stride (the _sN suffix of the pool type); unknown tags are
// undefined behavior via NNCASE_UNREACHABLE.
inline int32_t get_kpu_filter_stride(kpu_pool_type_t filter)
{
    switch (filter)
    {
    case kpu_pool_bypass:
        return 1;
    case kpu_pool_max_2_s2:
    case kpu_pool_mean_2_s2:
    case kpu_pool_left_top_2_s2:
    case kpu_pool_right_top_2_s2:
        return 2;
    case kpu_pool_max_2_s1:
    case kpu_pool_mean_2_s1:
        return 1;
    case kpu_pool_max_4_s4:
    case kpu_pool_mean_4_s4:
    case kpu_pool_left_top_4_s4:
        return 4;
    default:
        NNCASE_UNREACHABLE();
    }
}

// Output extent after pooling (integer division by the stride).
inline int32_t get_kpu_pool_output_size(int32_t input, kpu_pool_type_t pool_type)
{
    return input / get_kpu_filter_stride(pool_type);
}

// { row, column } offset sampled by the "select" pool types; other tags are
// undefined behavior via NNCASE_UNREACHABLE.
inline std::array<int32_t, 2> get_kpu_select_pool_offset(kpu_pool_type_t pool_type)
{
    switch (pool_type)
    {
    case kpu_pool_left_top_2_s2:
        return { 0, 0 };
    case kpu_pool_right_top_2_s2:
        return { 0, 1 };
    case kpu_pool_left_top_4_s4:
        return { 0, 0 };
    default:
        NNCASE_UNREACHABLE();
    }
}
END_NS_NNCASE_RT_K210

View File

@ -0,0 +1,331 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <nncase/runtime/datatypes.h>
BEGIN_NS_NNCASE_RT_K210
// Memory pool id for tensors placed in KPU RAM, and its fixed capacity.
NNCASE_INLINE_VAR constexpr memory_location_t mem_kpu = 4;
NNCASE_INLINE_VAR constexpr size_t KPU_RAM_SIZE = 2 * 1024 * 1024; // 2MB

// K210 KPU layer descriptor: each member overlays one 64-bit hardware
// register (`reg` raw view / `data` bit-field view). Field widths and order
// mirror the silicon register layout - never reorder or repack.
typedef struct
{
    // Interrupt/feature enables. NOTE(review): "enabe" typo is part of the
    // public member name.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t int_en : 1;
            uint64_t ram_flag : 1;
            uint64_t full_add : 1;
            uint64_t depth_wise_layer : 1;
            uint64_t reserved : 60;
        } data;
    } interrupt_enabe;

    // Source/destination image addresses in KPU RAM.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t image_src_addr : 15;
            uint64_t reserved0 : 17;
            uint64_t image_dst_addr : 15;
            uint64_t reserved1 : 17;
        } data;
    } image_addr;

    // Input/output channel counts.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t i_ch_num : 10;
            uint64_t reserved0 : 22;
            uint64_t o_ch_num : 10;
            uint64_t reserved1 : 6;
            uint64_t o_ch_num_coef : 10;
            uint64_t reserved2 : 6;
        } data;
    } image_channel_num;

    // Input/output image dimensions.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t i_row_wid : 10;
            uint64_t i_col_high : 9;
            uint64_t reserved0 : 13;
            uint64_t o_row_wid : 10;
            uint64_t o_col_high : 9;
            uint64_t reserved1 : 13;
        } data;
    } image_size;

    // Kernel/pooling configuration and batch-norm table base address.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t kernel_type : 3;
            uint64_t pad_type : 1;
            uint64_t pool_type : 4;
            uint64_t first_stride : 1;
            uint64_t bypass_conv : 1;
            uint64_t load_para : 1;
            uint64_t reserved0 : 5;
            uint64_t dma_burst_size : 8;
            uint64_t pad_value : 8;
            uint64_t bwsx_base_addr : 32;
        } data;
    } kernel_pool_type_cfg;

    // Weight (parameter) load configuration.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t load_coor : 1;
            uint64_t load_time : 6;
            uint64_t reserved0 : 8;
            uint64_t para_size : 17;
            uint64_t para_start_addr : 32;
        } data;
    } kernel_load_cfg;

    // Coefficient addressing offsets.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t coef_column_offset : 4;
            uint64_t coef_row_offset : 12;
            uint64_t reserved0 : 48;
        } data;
    } kernel_offset;

    // Calculation configuration and activation table address.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t channel_switch_addr : 15;
            uint64_t reserved : 1;
            uint64_t row_switch_addr : 4;
            uint64_t coef_size : 8;
            uint64_t coef_group : 3;
            uint64_t load_act : 1;
            uint64_t active_addr : 32;
        } data;
    } kernel_calc_type_cfg;

    // Write-back addressing configuration.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t wb_channel_switch_addr : 15;
            uint64_t reserved0 : 1;
            uint64_t wb_row_switch_addr : 4;
            uint64_t wb_group : 3;
            uint64_t reserved1 : 41;
        } data;
    } write_back_cfg;

    // Convolution quantization arguments (shifts and multipliers).
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t shr_w : 4;
            uint64_t shr_x : 4;
            uint64_t arg_w : 24;
            uint64_t arg_x : 24;
            uint64_t reserved0 : 8;
        } data;
    } conv_value;

    // Convolution additive argument.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t arg_add : 40;
            uint64_t reserved : 24;
        } data;
    } conv_value2;

    // DMA transfer configuration for sending results to main memory.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t send_data_out : 1;
            uint64_t reserved : 15;
            uint64_t channel_byte_num : 16;
            uint64_t dma_total_byte : 32;
        } data;
    } dma_parameter;
} kpu_layer_argument_t;
// Hardware activation table: 16 piecewise-linear segments plus two bias
// registers (8 bytes of per-segment result bias each). Register overlay -
// do not reorder or repack.
typedef struct
{
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t shift_number : 8;
            uint64_t y_mul : 16;
            uint64_t x_start : 36;
        } data;
    } activate_para[16];

    union
    {
        uint64_t reg;
        struct
        {
            uint8_t result_bias[8];
        } data;
    } activate_para_bias0;

    union
    {
        uint64_t reg;
        struct
        {
            uint8_t result_bias[8];
        } data;
    } activate_para_bias1;
} kpu_activate_table_t;

// Hardware batch-norm arguments: out = (in * norm_mul + norm_add) >>
// norm_shift per the field names; register overlay.
typedef struct
{
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t norm_mul : 24;
            uint64_t norm_add : 32;
            uint64_t norm_shift : 4;
        } data;
    } batchnorm;
} kpu_batchnorm_argument_t;
// Conv filter kinds supported by the KPU (values are hardware encodings).
typedef enum _kpu_filter_type
{
    kpu_filter_1x1 = 0,
    kpu_filter_3x3 = 1
} kpu_filter_type_t;

// Pool kinds; the suffix encodes window and stride (e.g. max_2_s2 =
// 2x2 window, stride 2). Values are hardware encodings.
typedef enum _kpu_pool_type
{
    kpu_pool_bypass = 0,
    kpu_pool_max_2_s2 = 1,
    kpu_pool_mean_2_s2 = 2,
    kpu_pool_max_4_s4 = 3,
    kpu_pool_mean_4_s4 = 4,
    kpu_pool_left_top_2_s2 = 5,
    kpu_pool_right_top_2_s2 = 6,
    kpu_pool_left_top_4_s4 = 7,
    kpu_pool_mean_2_s1 = 8,
    kpu_pool_max_2_s1 = 9
} kpu_pool_type_t;

// Software-side batch-norm segment (mul/shift/add).
struct kpu_batchnorm_segment
{
    int32_t mul;
    int32_t shift;
    int32_t add;
};

inline bool operator==(const kpu_batchnorm_segment &lhs, const kpu_batchnorm_segment &rhs) noexcept
{
    return lhs.mul == rhs.mul && lhs.shift == rhs.shift && lhs.add == rhs.add;
}

inline bool operator!=(const kpu_batchnorm_segment &lhs, const kpu_batchnorm_segment &rhs) noexcept
{
    return !(lhs == rhs);
}

// Software-side activation segment: one piece of the piecewise-linear
// activation starting at start_x.
struct kpu_activation_segment
{
    int64_t start_x;
    int32_t mul;
    int32_t shift;
    int32_t add;
};

inline bool operator==(const kpu_activation_segment &lhs, const kpu_activation_segment &rhs) noexcept
{
    return lhs.start_x == rhs.start_x && lhs.mul == rhs.mul
        && lhs.shift == rhs.shift && lhs.add == rhs.add;
}

inline bool operator!=(const kpu_activation_segment &lhs, const kpu_activation_segment &rhs) noexcept
{
    return !(lhs == rhs);
}

// 16 segments mirrors the hardware activation table size.
using kpu_activation_table_t = std::array<kpu_activation_segment, 16>;
// Shape as [N, C, H, W].
using kpu_shape_t = std::array<uint32_t, 4>;

// Opcodes of the serialized K210 instruction stream.
enum class opcode_t : uint8_t
{
    kpu_upload,
    kpu_download,
    kpu_conv2d
};

// Copy a tensor from main memory into KPU RAM.
struct kpu_upload_options
{
    opcode_t opcode = opcode_t::kpu_upload;
    uint8_t reserved0[3];
    memory_range input;
    memory_range output;
    kpu_shape_t in_shape;
};

// Copy a tensor from KPU RAM back to main memory.
struct kpu_download_options
{
    opcode_t opcode = opcode_t::kpu_download;
    uint8_t reserved0[3];
    memory_range input;
    memory_range output;
    kpu_shape_t in_shape;
};

// Run one conv2d layer; `layer` is the raw hardware descriptor.
struct kpu_conv2d_options
{
    opcode_t opcode = opcode_t::kpu_conv2d;
    uint8_t reserved0[3];
    memory_range weights;
    memory_range batch_norm;
    memory_range activation;
    memory_range main_mem_output;
    uint32_t batches;
    kpu_layer_argument_t layer;
};

END_NS_NNCASE_RT_K210

View File

@ -0,0 +1,89 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "datatypes.h"
#include <cassert>
BEGIN_NS_NNCASE_RUNTIME
// Fixed storage size (bytes) of a section name in the serialized model.
NNCASE_INLINE_VAR constexpr size_t MAX_SECTION_NAME_LENGTH = 16;
// Top-level header of a serialized .kmodel file.
struct model_header
{
    uint32_t identifier; // model magic (see MODEL_IDENTIFIER)
    uint32_t version; // serialization version (see MODEL_VERSION)
    uint32_t flags;
    uint32_t alignment;
    uint32_t modules; // module count
    uint32_t main_module; // index of the entry module
};
// Header of one runtime module inside the model.
struct module_header
{
    module_type_t type;
    uint32_t size;
    uint32_t mempools;
    uint32_t inputs;
    uint32_t outputs;
    uint32_t sections;
    uint32_t reserved0; // padding/reserved
};
// Memory requirement of a module: byte count at a given memory location.
struct mempool_desc
{
    memory_location_t location;
    uint32_t size;
};
// Header of a named data section within a module.
struct section_header
{
    char name[MAX_SECTION_NAME_LENGTH];
    uint32_t flags; // e.g. SECTION_MERGED_INTO_RDATA
    uint32_t start;
    uint32_t size;
    uint32_t reserved0;
};
// Flag: the section body is stored inside the .rdata section.
NNCASE_INLINE_VAR constexpr uint32_t SECTION_MERGED_INTO_RDATA = 1;
// In-place view over a serialized shape: a uint32 count followed immediately
// in memory by `size` uint32 dimensions. Never constructed or copied -- only
// cast onto model data (hence the deleted constructors).
struct shape_header
{
    uint32_t size;

    shape_header() = delete;
    shape_header(shape_header &) = delete;
    shape_header &operator=(shape_header &) = delete;

    // Dimensions start right after this header in the serialized buffer.
    const uint32_t *begin() const noexcept
    {
        return reinterpret_cast<const uint32_t *>(reinterpret_cast<uintptr_t>(this) + sizeof(shape_header));
    }

    const uint32_t *end() const noexcept
    {
        return begin() + size;
    }

    // Bounds checked by assert only (debug builds).
    uint32_t operator[](size_t index) const
    {
        assert(index < size);
        return begin()[index];
    }
};
// 'KMDL' magic. NOTE(review): multi-character literals have an
// implementation-defined value; relies on compiler and runtime agreeing
// on the packing -- confirm toolchain assumptions.
NNCASE_INLINE_VAR constexpr uint32_t MODEL_IDENTIFIER = 'KMDL';
NNCASE_INLINE_VAR constexpr uint32_t MODEL_VERSION = 5;

View File

@ -0,0 +1,134 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include "span_reader.h"
#include <array>
#include <cassert>
BEGIN_NS_NNCASE_RUNTIME
// NNIL opcodes for the activation bytecode interpreter. Value ranges group the
// ops: 0x0x stack/constant loads, 0x2x unary math, 0x4x binary math,
// 0x80 clamp, 0xA0 return.
typedef enum _nnil_opcode
{
    nnil_nop = 0x00,
    nnil_dup = 0x01,
    nnil_pop = 0x02,
    nnil_lda_0 = 0x03,
    nnil_ldc_r4_0 = 0x04,
    nnil_ldc_r4_1 = 0x05,
    nnil_ldc_r4 = 0x06, // followed by an inline float32 operand
    nnil_abs = 0x20,
    nnil_ceil = 0x21,
    nnil_cos = 0x22,
    nnil_exp = 0x23,
    nnil_floor = 0x24,
    nnil_log = 0x25,
    nnil_neg = 0x26,
    nnil_rsqrt = 0x27,
    nnil_sin = 0x28,
    nnil_sqrt = 0x29,
    nnil_square = 0x2A,
    nnil_tanh = 0x2B,
    nnil_bitwise_not = 0x2C,
    nnil_logical_not = 0x2D,
    nnil_round = 0x2E,
    nnil_add = 0x40,
    nnil_sub = 0x41,
    nnil_mul = 0x42,
    nnil_div = 0x43,
    nnil_min = 0x44,
    nnil_max = 0x45,
    nnil_clamp = 0x80,
    nnil_ret = 0xA0
} nnil_opcode_t;
// Inline operand of nnil_ldc_r4: a float32 constant to push.
typedef struct _nnil_ldc_r4
{
    float r4;
} nnil_ldc_r4_t;
// One decoded instruction: opcode plus (for nnil_ldc_r4 only) its operand.
typedef struct _nnil_op
{
    nnil_opcode_t opcode;
    union
    {
        nnil_ldc_r4_t ldc_r4;
    };
} nnil_op_t;
// Sequential decoder of NNIL bytecode from a span_reader. Only nnil_ldc_r4
// carries an inline operand; all other ops are a single opcode byte.
class nnil_reader
{
public:
    nnil_reader(span_reader &reader)
        : reader_(reader) { }

    // True while at least one byte remains in the stream.
    bool avail() const noexcept { return !reader_.empty(); }

    // Decode the next instruction; asserts that bytes remain.
    nnil_op_t next()
    {
        assert(avail());
        nnil_op_t op;
        op.opcode = (nnil_opcode_t)reader_.read<uint8_t>();
        switch (op.opcode)
        {
        case nnil_ldc_r4:
            // Operand is not guaranteed to be 4-byte aligned in the stream.
            op.ldc_r4 = reader_.read_unaligned<nnil_ldc_r4_t>();
            break;
        default:
            break;
        }
        return op;
    }

private:
    span_reader &reader_;
};
// Fixed-capacity (64 entries) float evaluation stack for interpreting NNIL
// activation bytecode. Bounds are enforced with assert only (debug builds).
class nnil_evalstack
{
public:
    nnil_evalstack() noexcept
        : top(0)
    {
    }

    // Push one value; asserts the stack is not full.
    void push(float value)
    {
        assert(top < _stack.size());
        _stack[top++] = value;
    }

    // Pop and return the top value; asserts the stack is not empty.
    float pop()
    {
        assert(top > 0);
        return _stack[--top];
    }

    // Duplicate the top value; asserts the stack is non-empty AND not full.
    void dup()
    {
        assert(top > 0);
        // Fix: the overflow check was missing, so dup() on a full stack wrote
        // one element past the end of _stack.
        assert(top < _stack.size());
        _stack[top] = _stack[top - 1];
        top++;
    }

private:
    std::array<float, 64> _stack;
    size_t top; // index of the first free slot; stack depth
};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,373 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <functional>
#include <mpark/variant.hpp>
#include <system_error>
#include <type_traits>
namespace nncase
{
// Evaluate x (a result); on error, propagate the error out of the enclosing
// function, discarding any Ok payload.
#define try_(x) \
    { \
        auto v = (x); \
        if (!v.is_ok()) \
            return nncase::err(std::move(v.unwrap_err())); \
    }
// Evaluate x; on success bind its payload to a NEW variable `name`,
// on error propagate the error out of the enclosing function.
#define try_var(name, x) \
    typename decltype((x))::traits::ok_type name; \
    { \
        auto v = (x); \
        if (v.is_ok()) \
            name = std::move(v.unwrap()); \
        else \
            return nncase::err(std::move(v.unwrap_err())); \
    }
// Like try_var, but for void-returning functions: on error, store the error
// into `e` (an out result) and return.
#define try_var_err(name, x, e) \
    typename decltype((x))::traits::ok_type name; \
    { \
        auto v = (x); \
        if (v.is_ok()) \
        { \
            name = std::move(v.unwrap()); \
        } \
        else \
        { \
            e = nncase::err(std::move(v.unwrap_err())); \
            return; \
        } \
    }
// Evaluate x; on success assign its payload to an EXISTING lvalue `name`,
// on error propagate the error out of the enclosing function.
#define try_set(name, x) \
    { \
        auto v = (x); \
        if (v.is_ok()) \
            name = std::move(v.unwrap()); \
        else \
            return nncase::err(std::move(v.unwrap_err())); \
    }
// Success payload wrapper: what result<T> stores on the Ok path.
template <class T>
struct Ok
{
    constexpr Ok(T &&value)
        : value(std::move(value)) { }
    constexpr Ok(const T &value)
        : value(value) { }

    // In-place construction of the payload from args.
    template <class... Args>
    constexpr explicit Ok(mpark::in_place_t, Args &&...args)
        : value(std::forward<Args>(args)...) { }

    T value;
};
// Empty success payload for result<void>.
template <>
struct Ok<void>
{
};
// Failure payload: wraps a std::error_condition.
struct Err
{
    // Implicit construction from any error-condition enum.
    template <class ErrCode, class = std::enable_if_t<std::is_error_condition_enum<ErrCode>::value>>
    Err(ErrCode value)
        : err(value) { }
    Err(std::error_condition err)
        : err(std::move(err)) { }

    std::error_condition err;
};
// Success with no payload (for result<void>).
inline constexpr Ok<void> ok()
{
    return {};
}
// Success constructing T in place from args.
template <class T, class... Args>
constexpr Ok<T> ok(Args &&...args)
{
    return Ok<T>(mpark::in_place, std::forward<Args>(args)...);
}
// Success wrapping an existing value (type deduced and decayed).
template <class T>
constexpr Ok<std::decay_t<T>> ok(T &&value)
{
    return Ok<std::decay_t<T>>(std::forward<T>(value));
}
// Failure from an error_condition.
inline Err err(std::error_condition value) noexcept
{
    return Err(std::move(value));
}
// Failure from any error-condition enum.
template <class ErrCode, class = std::enable_if_t<std::is_error_condition_enum<ErrCode>::value>>
Err err(ErrCode value)
{
    return err(std::error_condition(value));
}
template <class T>
class NNCASE_NODISCARD result;
namespace detail
{
// Trait: true iff T is a result<...> specialization.
template <class T>
NNCASE_INLINE_VAR bool constexpr is_result_v = false;
template <class T>
NNCASE_INLINE_VAR bool constexpr is_result_v<result<T>> = true;
// Exposes the Ok payload type of a result; rejects nesting (result<result<T>>).
template <class T>
struct result_traits
{
    static_assert(!is_result_v<T>, "Cannot use nested result");
    using ok_type = T;
};
// Invokes the map() callback on a non-void Ok payload and re-wraps the return.
// Fix: was declared `class`, leaving operator() private and making
// map_traits<T, Func>::operator() ill-formed on instantiation; `struct`
// restores the intended public access.
template <class T, class U, class Func>
struct map_call_impl
{
    result<U> operator()(Func &&func, Ok<T> &value) noexcept
    {
        return ok(func(value.value));
    }
};
template <class T, class Func>
struct map_traits;

// Invokes a no-argument map() callback (for result<void>) and wraps its return.
// Fix: was declared `class`, leaving operator() private and making
// map_traits<void, Func> ill-formed on instantiation; `struct` restores the
// intended public access.
template <class U, class Func>
struct map_call_void_impl
{
    result<U> operator()(Func &&func) noexcept
    {
        return ok(func());
    }
};

// map() adapter for result<void>: the callback takes no arguments.
template <class Func>
struct map_traits<void, Func>
{
    using U = invoke_result_t<Func>;
    static_assert(!is_result_v<U>, "Cannot map a callback returning result, use and_then instead");
    result<U> operator()(Func &&func, NNCASE_UNUSED Ok<void> &value) noexcept
    {
        return map_call_void_impl<U, Func>()(std::forward<Func>(func));
    }
};
template <class T, class Func>
struct map_err_traits;
// Traits for result<T>::and_then: the callback receives the Ok payload and
// must itself return a result (enforced by the static_assert).
template <class T, class Func>
struct and_then_traits
{
    using result_t = invoke_result_t<Func, T>;
    using traits_t = typename result_t::traits;
    using U = typename traits_t::ok_type;
    static_assert(is_result_v<result_t>, "Cannot then a callback not returning result, use map instead");
    result_t operator()(Func &&func, Ok<T> &value) noexcept
    {
        return func(value.value);
    }
};
// and_then for result<void>: the callback takes no arguments.
template <class Func>
struct and_then_traits<void, Func>
{
    using result_t = invoke_result_t<Func>;
    using traits_t = typename result_t::traits;
    using U = typename traits_t::ok_type;
    static_assert(is_result_v<result_t>, "Cannot then a callback not returning result, use map instead");
    result_t operator()(Func &&func, NNCASE_UNUSED Ok<void> &value) noexcept
    {
        return func();
    }
};
// Extracts the payload from an Ok<T> wrapper (lvalue and rvalue forms).
template <class T>
struct unwrap_impl
{
    T &operator()(Ok<T> &value) noexcept
    {
        return value.value;
    }
    T &&operator()(Ok<T> &&value) noexcept
    {
        return std::move(value.value);
    }
};
// result<void> has no payload: unwrapping is a no-op.
template <>
struct unwrap_impl<void>
{
    void operator()(NNCASE_UNUSED Ok<void> &value) noexcept
    {
    }
    void operator()(NNCASE_UNUSED Ok<void> &&value) noexcept
    {
    }
};
}
// Rust-style result type: holds either Ok<T> (variant index 0) or Err
// (index 1). Marked nodiscard so failures cannot be silently dropped.
template <class T>
class NNCASE_NODISCARD result
{
public:
    using traits = detail::result_traits<T>;

    // Implicit conversions from the ok()/err() factory wrappers.
    constexpr result(Ok<T> value)
        : ok_or_err_(std::move(value)) { }
    result(Err err)
        : ok_or_err_(std::move(err)) { }

    constexpr bool is_ok() const noexcept { return ok_or_err_.index() == 0; }
    constexpr bool is_err() const noexcept { return ok_or_err_.index() == 1; }

    // Returns the payload; terminates the process when holding an error.
    constexpr decltype(auto) unwrap() noexcept
    {
        if (is_ok())
            return detail::unwrap_impl<T>()(value());
        else
            std::terminate();
    }

    // Like unwrap(), but throws std::runtime_error carrying the error's
    // message instead of terminating.
    constexpr decltype(auto) unwrap_or_throw() &
    {
        if (is_ok())
            return detail::unwrap_impl<T>()(value());
        else
            throw std::runtime_error(unwrap_err().message());
    }

    constexpr decltype(auto) unwrap_or_throw() &&
    {
        if (is_ok())
            return detail::unwrap_impl<T>()(std::move(value()));
        else
            throw std::runtime_error(unwrap_err().message());
    }

    // Returns the error; terminates when holding a value.
    constexpr std::error_condition &unwrap_err() noexcept
    {
        if (is_ok())
            std::terminate();
        else
            return err().err;
    }

    // Same as unwrap(); `message` is currently unused (never printed).
    constexpr auto expect(NNCASE_UNUSED gsl::cstring_span message) noexcept
    {
        if (is_ok())
            return detail::unwrap_impl<T>()(value());
        else
            std::terminate();
    }

    // Transform the Ok payload with func (func must NOT return a result;
    // use and_then for that). Errors pass through unchanged.
    template <class Func, class Traits = detail::map_traits<T, Func>>
    constexpr typename Traits::result_t map(Func &&func) noexcept
    {
        if (is_ok())
            return Traits()(std::forward<Func>(func), value());
        else
            return err();
    }

    // Transform the error with func; the Ok path passes through unchanged.
    template <class Func, class Traits = detail::map_err_traits<T, Func>>
    constexpr typename Traits::result_t map_err(Func &&func) noexcept
    {
        if (is_ok())
            return value();
        else
            return Traits()(std::forward<Func>(func), err());
    }

    // Chain a callback that itself returns a result.
    template <class Func, class Traits = detail::and_then_traits<T, Func>>
    constexpr typename Traits::result_t and_then(Func &&func) noexcept
    {
        if (is_ok())
            return Traits()(std::forward<Func>(func), value());
        else
            return err();
    }

private:
    // Accessors for the active alternative; mpark::get throws
    // bad_variant_access if the other alternative is active.
    constexpr Ok<T> &&value() &&noexcept { return mpark::get<Ok<T>>(ok_or_err_); }
    constexpr Ok<T> &value() &noexcept { return mpark::get<Ok<T>>(ok_or_err_); }
    constexpr Err &err() noexcept { return mpark::get<Err>(ok_or_err_); }

private:
    mpark::variant<Ok<T>, Err> ok_or_err_;
};
namespace detail
{
// Traits for result<T>::map (non-void T): the callback receives the payload
// and must NOT itself return a result (use and_then for that).
template <class T, class Func>
struct map_traits
{
    using U = invoke_result_t<Func, T>;
    static_assert(!is_result_v<U>, "Cannot map a callback returning result, use and_then instead");
    using result_t = result<U>;
    result<U> operator()(Func &&func, Ok<T> &value) noexcept
    {
        return map_call_impl<T, U, Func>()(std::forward<Func>(func), value);
    }
};
template <class T, class Func>
struct map_err_traits
{
using U = invoke_result_t<Func, Err>;
static_assert(!is_result_v<U>, "Cannot map a callback returning result, use and_then instead");
result<U> operator()(Func &&func, Err &value) noexcept
{
return err(func(value.err));
}
};
// map() callback returning void, applied to a non-void Ok payload.
// Fix: was declared `class`, leaving operator() private and inaccessible from
// map_traits; `struct` restores the intended public access.
template <class T, class Func>
struct map_call_impl<T, void, Func>
{
    result<void> operator()(Func &&func, Ok<T> &value) noexcept
    {
        func(value.value);
        return ok();
    }
};
// map() callback returning void on a result<void>.
// Fix: `struct` (was `class`) -- same private-access defect as above.
template <class Func>
struct map_call_void_impl<void, Func>
{
    result<void> operator()(Func &&func) noexcept
    {
        func();
        return ok();
    }
};
}
}

View File

@ -0,0 +1,36 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include <nncase/runtime/error.h>
#include <nncase/runtime/model.h>
#include <nncase/runtime/result.h>
#include <nncase/runtime/runtime_module.h>
BEGIN_NS_NNCASE_RUNTIME
// Factory signature each runtime-module implementation exports: fills
// `result` with a newly created runtime_module, or with an error.
typedef void (*rt_module_activator_t)(result<std::unique_ptr<runtime_module>> &result);
// Well-known symbol name of the activator looked up when loading a module.
#define RUNTIME_MODULE_ACTIVATOR_NAME create_runtime_module
// Maps a module type id to its activator (for statically linked runtimes).
struct runtime_registration
{
    module_type_t id;
    rt_module_activator_t activator;
};
// Registration table of built-in runtimes; defined out of line.
extern runtime_registration builtin_runtimes[];
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,94 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "model.h"
#include "result.h"
#include "runtime_tensor.h"
BEGIN_NS_NNCASE_RUNTIME
class interpreter;
// Services a runtime module uses while deserializing itself from the model.
struct NNCASE_API runtime_module_init_context
{
    // True when the module's sections stay resident in the model buffer.
    virtual bool is_section_pinned() const noexcept = 0;
    virtual interpreter &interp() noexcept = 0;
    virtual const module_header &header() noexcept = 0;
    // Body of the named section (behavior for a missing name is defined by
    // the implementation -- confirm).
    virtual gsl::span<const gsl::byte> section(const char *name) noexcept = 0;
};
// Base class of all runtime module implementations. Owns per-module mempool
// descriptors and input/output tensor bindings; subclasses implement the
// *_core hooks and tensor allocation/validation.
class NNCASE_API runtime_module
{
private:
    // Book-keeping for one input/output: declared shape/strides/range plus the
    // user-bound, staging and device-side tensors.
    struct inout_tensor_info
    {
        runtime_shape_t shape;
        runtime_shape_t strides;
        memory_range range;
        runtime_tensor bind_tensor;
        runtime_tensor staging_tensor;
        runtime_tensor device_tensor;
    };

public:
    // Instantiate the registered module implementation for `type`.
    static result<std::unique_ptr<runtime_module>> create(const module_type_t &type);

    runtime_module() = default;
    runtime_module(runtime_module &) = delete;
    virtual ~runtime_module() = default;

    // Deserialize this module from its header; stores `interp`.
    result<void> initialize(const module_header &header, interpreter &interp) noexcept;
    // Second init phase, run after all modules exist (cross-module wiring).
    virtual result<void> initialize_inter_modules(interpreter &interp) noexcept;
    const module_type_t &type() const noexcept;

    interpreter &interp() const noexcept { return *interp_; }

    // Memory pool requirements declared by this module.
    uint32_t mempools_size() const noexcept;
    const mempool_desc &mempool(size_t index) const noexcept;
    mempool_desc mempool(memory_location_t location) const noexcept;

    // Input accessors: declared metadata, and get/bind the user tensor.
    uint32_t inputs_size() const noexcept;
    const runtime_shape_t &input_shape(size_t index) const noexcept;
    const memory_range &input_desc(size_t index) const noexcept;
    result<runtime_tensor> input_tensor(size_t index) noexcept;
    result<void> input_tensor(size_t index, runtime_tensor tensor) noexcept;

    // Output accessors, symmetric with the input ones.
    uint32_t outputs_size() const noexcept;
    const runtime_shape_t &output_shape(size_t index) const noexcept;
    const memory_range &output_desc(size_t index) const noexcept;
    result<runtime_tensor> output_tensor(size_t index) noexcept;
    result<void> output_tensor(size_t index, runtime_tensor tensor) noexcept;

    // Execute the module (wraps run_core()).
    result<void> run() noexcept;

protected:
    virtual result<void> initialize_core(runtime_module_init_context &context) noexcept = 0;
    virtual result<runtime_tensor> allocate_input_tensor(size_t index) noexcept = 0;
    virtual result<runtime_tensor> allocate_output_tensor(size_t index) noexcept = 0;
    virtual result<void> validate_input_tensor(size_t index, runtime_tensor tensor) noexcept = 0;
    virtual result<void> validate_output_tensor(size_t index, runtime_tensor tensor) noexcept = 0;
    result<runtime_tensor> device_input_tensor(size_t index) noexcept;
    result<runtime_tensor> device_output_tensor(size_t index) noexcept;
    virtual result<void> run_core() noexcept = 0;

private:
    module_header header_;
    std::vector<mempool_desc> mempools_;
    std::vector<inout_tensor_info> input_tensors_;
    std::vector<inout_tensor_info> output_tensors_;
    interpreter *interp_ = nullptr;
};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,259 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "datatypes.h"
#include "result.h"
BEGIN_NS_NNCASE_RUNTIME
// Size in bytes of one element of `type`.
inline constexpr size_t get_bytes(datatype_t type)
{
    return nncase::detail::datatype_bytes(type);
}
// Number of elements in a shape: product of all dimensions (empty shape -> 1).
inline size_t compute_size(const runtime_shape_t &shape)
{
    // Fix: seed the accumulation with size_t, not int. std::accumulate uses
    // the init argument's type for the running product, so the original `1`
    // computed (and could overflow/truncate) the product in int.
    return std::accumulate(shape.begin(), shape.end(), size_t { 1 }, std::multiplies<size_t>());
}
// Total bytes of a densely packed tensor of `type` with `shape`.
inline size_t get_bytes(datatype_t type, const runtime_shape_t &shape)
{
    return compute_size(shape) * get_bytes(type);
}
// Elements needed to back a strided view: extent * stride of the dimension
// with the largest stride, skipping broadcast dims (shape[i] == 1); at least 1.
inline size_t compute_size(const runtime_shape_t &shape, const runtime_shape_t &strides)
{
    size_t max_stride = 0, max_shape = 0;
    for (size_t i = 0; i < shape.size(); i++)
    {
        // A broadcast dim contributes stride 0 and so can never dominate.
        if ((shape[i] == 1 ? 0 : strides[i]) > max_stride)
        {
            max_stride = strides[i];
            max_shape = shape[i];
        }
    }
    size_t size = max_stride * max_shape;
    return size ? size : 1;
}
// Bytes needed to back a strided view of `type`.
inline size_t get_bytes(datatype_t type, const runtime_shape_t &shape, const runtime_shape_t &strides)
{
    return compute_size(shape, strides) * get_bytes(type);
}
namespace detail
{
// With the nullptr tag: zero the stride of broadcast (extent-1) dimensions.
template <class shape_type, class strides_type>
inline void adapt_strides(const shape_type &shape, strides_type &strides,
    std::nullptr_t, typename strides_type::size_type i) noexcept
{
    if (shape[i] == 1)
    {
        strides[i] = 0;
    }
}
// Fill `strides` with row-major strides for `shape` (innermost stride 1),
// applying adapt_strides per dimension; returns the total element count.
template <class shape_type, class strides_type, class bs_ptr>
inline std::size_t compute_strides(const shape_type &shape,
    strides_type &strides, bs_ptr bs)
{
    using strides_value_type = typename std::decay_t<strides_type>::value_type;
    strides_value_type data_size = 1;
    for (std::size_t i = shape.size(); i != 0; --i)
    {
        strides[i - 1] = data_size;
        data_size = strides[i - 1] * static_cast<strides_value_type>(shape[i - 1]);
        adapt_strides(shape, strides, bs, i - 1);
    }
    return static_cast<std::size_t>(data_size);
}
}
// Row-major strides with broadcast dims zeroed; returns the element count.
template <class shape_type, class strides_type>
inline std::size_t compute_strides(const shape_type &shape, strides_type &strides)
{
    return detail::compute_strides(shape, strides, nullptr);
}
// Default (row-major, broadcast-aware) strides for `shape`.
inline runtime_shape_t get_default_strides(const runtime_shape_t &shape)
{
    runtime_shape_t strides(shape.size());
    compute_strides(shape, strides);
    return strides;
}
// Reinterpret a shape in a different element type by rescaling the innermost
// dimension: last' = last * sizeof(src) / sizeof(dest). Empty shapes pass
// through unchanged.
template <class TShape>
TShape convert_shape_type(const TShape &shape, datatype_t src, datatype_t dest)
{
    TShape converted = shape;
    if (converted.empty())
        return converted;

    const auto src_bytes = get_bytes(src);
    const auto dest_bytes = get_bytes(dest);
    converted.back() = converted.back() * src_bytes / dest_bytes;
    return converted;
}
// Reinterpret strides in a different element type. Inner (non-last) strides
// are rescaled by sizeof(src)/sizeof(dest); the innermost stride must already
// be 0 (broadcast) or 1 (contiguous), otherwise not_supported is returned.
template <class TShape>
result<TShape> convert_strides_type(const TShape &strides, datatype_t src, datatype_t dest)
{
    const auto src_size = get_bytes(src);
    const auto dest_size = get_bytes(dest);
    if (src_size == dest_size)
        return ok(strides);

    TShape new_strides = strides;
    // 1. Except last dim. Fix: the bound was `new_strides.size() - 1`, which
    // wraps around for empty strides; `i + 1 < size()` is wrap-safe.
    for (size_t i = 0; i + 1 < new_strides.size(); i++)
    {
        auto &v = new_strides[i];
        if (v == 0)
            v = 1;
        v = v * src_size / dest_size;
    }

    // 2. Last dim
    if (!new_strides.empty())
    {
        // 2.1. If last dim is not 0 or 1, unsupported.
        auto last_dim = new_strides.back();
        // Fix: was `last_dim != 0 || last_dim != 1`, which is true for every
        // value, so every conversion reaching this point was rejected.
        if (last_dim != 0 && last_dim != 1)
            return err(std::errc::not_supported);
    }
    return ok(new_strides);
}
// Count the zero bits above the highest set bit within the low `Bits` bits of
// value; returns Bits when no bit in that range is set.
template <int32_t Bits, class T>
uint8_t count_leading_zeros(T value)
{
    uint8_t zeros = 0;
    int32_t bit = Bits - 1;
    while (bit >= 0 && (value & (1ULL << bit)) == 0)
    {
        ++zeros;
        --bit;
    }
    return zeros;
}
// Mask with the low `shift` bits set, e.g. bit_mask(3) == 0b111.
template <class T = uint64_t>
inline T bit_mask(uint8_t shift)
{
    return (T(1) << shift) - 1;
}
// Arithmetic shift with rounding.
//   shift > 0 : value >> shift, rounded -- Banker selects round-half-to-even,
//               otherwise round-half-up (adds 1 << (shift - 1) before shifting).
//   shift < 0 : value << -shift (no rounding).
//   shift == 0: value unchanged.
// NOTE(review): in the Banker path `sign` moves half-way values AWAY from the
// integral part for negative inputs, while the fraction bits come from
// two's-complement floor semantics -- confirm the intended rounding of
// negative inputs.
template <class T, bool Banker = false>
T carry_shift(T value, int32_t shift)
{
    if (shift > 0)
    {
        if (Banker)
        {
            // Sign | Int (T - shift - 1 bits) | Frac (shift bits)
            // S IIII FFF
            auto integral = value >> shift;
            auto fractional = value & bit_mask(shift);
            auto sign = value < 0 ? -1 : 1;
            auto half = size_t(1) << (shift - 1);

            // frac < 0.5
            if (fractional < half)
            {
                return integral;
            }
            // frac > 0.5
            else if (fractional > half)
            {
                return integral + sign;
            }
            // frac == 0.5: round to even
            else
            {
                // odd
                if (integral & 1)
                    return integral + sign;
                // even
                else
                    return integral;
            }
            // Fix: removed an uninitialized `T result;` declared above and an
            // unreachable `return result;` here -- dead code that would have
            // been UB had it ever executed.
        }
        else
        {
            value += T(1) << (shift - 1);
            value >>= shift;
        }
    }
    else if (shift < 0)
    {
        value = value << (-shift);
    }
    return value;
}
// Multiply in 64 bits, then apply carry_shift to the product and truncate the
// rounded result back to 32 bits.
template <bool Banker = false>
inline int32_t mul_and_carry_shift(int32_t value, int32_t mul, int32_t shift)
{
    const int64_t product = (int64_t)value * mul;
    return (int32_t)carry_shift<int64_t, Banker>(product, shift);
}
// Clamp value into [min, max]. Equivalent to std::min(max, std::max(value,
// min)): when min > max, the result is max.
template <class T>
inline T clamp(T value, T min, T max)
{
    if (value < min)
        value = min;
    if (max < value)
        value = max;
    return value;
}
// Clamp to the value range of a signed `Bits`-bit integer.
// NOTE(review): relies on arithmetic right shift of a negative value for the
// lower bound (implementation-defined before C++20) -- fine on the targeted
// GCC/RISC-V toolchain, but confirm if ported.
template <uint8_t Bits>
inline int32_t clamp(int32_t value)
{
    auto min = std::numeric_limits<int32_t>::lowest() >> (32 - Bits);
    auto max = std::numeric_limits<int32_t>::max() >> (32 - Bits);
    return clamp(value, min, max);
}
// True iff `strides` are exactly the default (contiguous row-major) strides
// for `shape`.
template <class TShape>
inline bool is_contiguous(const TShape &shape, const TShape &strides)
{
    return get_default_strides(shape) == strides;
}
// Scanning from the innermost dimension outwards, returns i + 1 for the first
// index i whose stride differs from the default; -1 when every stride matches.
inline int get_last_not_contiguous_index(const runtime_shape_t &strides, const runtime_shape_t &default_strides)
{
    for (int i = strides.size() - 1; i >= 0; --i)
    {
        if (strides[i] != default_strides[i])
        {
            return i + 1;
        }
    }
    return -1;
}
// Compile-time size_t inequality as an integral_constant value.
template<size_t A, size_t B>
constexpr auto is_not_equal = std::integral_constant<bool, std::not_equal_to<size_t> {}(A, B)> {};
// Empty tag type; presumably a default for callable template parameters --
// confirm at use sites.
struct DefaultCallable {};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,148 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "model.h"
#include "result.h"
#include <functional>
#include <memory>
BEGIN_NS_NNCASE_RUNTIME
// Identity tag for a tensor backend (e.g. host). Instances are non-copyable
// singletons compared by ADDRESS, not by the `data` string.
struct runtime_tensor_type
{
    const char *data;

    explicit runtime_tensor_type(const char *data) noexcept
        : data(data)
    {
    }

    runtime_tensor_type(runtime_tensor_type &) = delete;
    runtime_tensor_type &operator=(runtime_tensor_type &) = delete;
};

// Identity comparison: two types are equal only when they are the same object.
inline bool operator==(runtime_tensor_type &lhs, runtime_tensor_type &rhs) noexcept
{
    runtime_tensor_type *l = &lhs;
    runtime_tensor_type *r = &rhs;
    return l == r;
}

inline bool operator!=(runtime_tensor_type &lhs, runtime_tensor_type &rhs) noexcept
{
    return !(&lhs == &rhs);
}
namespace detail
{
class runtime_tensor_impl;
class host_runtime_tensor_impl;
}
// Handle to a tensor owned by some runtime backend; shared-ownership pimpl,
// cheap to copy. A handle with no impl is empty().
class NNCASE_API runtime_tensor
{
public:
    runtime_tensor() noexcept;
    runtime_tensor(std::shared_ptr<detail::runtime_tensor_impl> impl) noexcept;

    datatype_t datatype() const noexcept;
    const runtime_shape_t &shape() const noexcept;
    const runtime_shape_t &strides() const noexcept;
    // Backend identity tag (compared by address; see runtime_tensor_type).
    runtime_tensor_type &tensor_type() const noexcept;
    bool empty() const noexcept;
    bool is_host() const noexcept;
    bool is_contiguous() const noexcept;

    detail::runtime_tensor_impl *impl() noexcept { return impl_.get(); }
    const detail::runtime_tensor_impl *impl() const noexcept { return impl_.get(); }

    bool can_copy_to_without_staging(const runtime_tensor &dest) const noexcept;
    result<void> copy_to(runtime_tensor &dest) noexcept;
    result<runtime_tensor> as_host() noexcept;
    // Drops the reference to the implementation.
    void reset() noexcept;

private:
    std::shared_ptr<detail::runtime_tensor_impl> impl_;
};
// Tensor handle equality (defined out of line).
NNCASE_API bool operator==(const runtime_tensor &lhs, const runtime_tensor &rhs) noexcept;
NNCASE_API bool operator!=(const runtime_tensor &lhs, const runtime_tensor &rhs) noexcept;
// Factory and map/sync API for tensors backed by host memory.
namespace host_runtime_tensor
{
// Which pool backs the allocation (shared presumably allows device access via
// a physical address -- confirm platform implementation).
typedef enum memory_pool_
{
    pool_cpu_only,
    pool_shared
} memory_pool_t;
// Cache maintenance direction for sync().
typedef enum sync_op_
{
    sync_invalidate,
    sync_write_back
} sync_op_t;
// Access mode requested when mapping a tensor's buffer.
typedef enum map_access_
{
    map_none = 0,
    map_read = 1,
    map_write = 2,
    map_read_write = 3
} map_access_t;
DEFINE_ENUM_BITMASK_OPERATORS(map_access_t)
// Move-only view of a mapped tensor buffer; offers unmap() explicitly and has
// a destructor (which presumably unmaps -- confirm implementation).
class NNCASE_API mapped_buffer
{
public:
    mapped_buffer() noexcept;
    mapped_buffer(detail::host_runtime_tensor_impl &impl, map_access_t access, uintptr_t address, size_t size_bytes) noexcept;
    mapped_buffer(mapped_buffer &&other) noexcept;
    mapped_buffer(const mapped_buffer &) = delete;
    ~mapped_buffer();
    mapped_buffer &operator=(mapped_buffer &&) noexcept;
    mapped_buffer &operator=(const mapped_buffer &) = delete;

    result<void> unmap() noexcept;

    // The mapped bytes.
    gsl::span<gsl::byte> buffer() const noexcept
    {
        return { reinterpret_cast<gsl::byte *>(address_), size_bytes_ };
    }

private:
    detail::host_runtime_tensor_impl *impl_;
    map_access_t access_;
    uintptr_t address_;
    size_t size_bytes_;
};
// Custom deleter invoked when a non-copying, caller-owned buffer is released.
typedef std::function<void(gsl::byte *)> data_deleter_t;
// Identity tag of the host tensor backend.
NNCASE_API runtime_tensor_type &tensor_type() noexcept;
// Factory overloads: allocate, wrap (copy or borrow) an existing buffer,
// optionally with explicit strides, pool and physical address.
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, gsl::span<gsl::byte> data, bool copy, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, gsl::span<gsl::byte> data, data_deleter_t data_deleter, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span<gsl::byte> data, bool copy, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span<gsl::byte> data, data_deleter_t data_deleter, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<memory_pool_t> memory_pool(const runtime_tensor &tensor) noexcept;
NNCASE_API result<mapped_buffer> map(runtime_tensor &tensor, map_access_t access) noexcept;
NNCASE_API result<void> sync(runtime_tensor &tensor, sync_op_t op, bool force = false) noexcept;
}
// Short alias used throughout the runtime.
namespace hrt = host_runtime_tensor;
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,49 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_tensor.h"
BEGIN_NS_NNCASE_RUNTIME
namespace detail
{
// Backend interface behind runtime_tensor: concrete implementations provide
// metadata and the cross-backend copy hooks below.
class NNCASE_API runtime_tensor_impl
{
public:
    virtual ~runtime_tensor_impl() = default;

    virtual datatype_t datatype() const noexcept = 0;
    virtual const runtime_shape_t &shape() const noexcept = 0;
    virtual const runtime_shape_t &strides() const noexcept = 0;
    virtual runtime_tensor_type &tensor_type() const noexcept = 0;

    bool is_host() const noexcept;
    bool is_contiguous() const noexcept;

    bool can_copy_to_without_staging(const runtime_tensor &dest) const noexcept;
    result<void> copy_to(runtime_tensor &dest) noexcept;
    result<runtime_tensor> copy_as_host() noexcept;

    // Copy hooks dispatched by copy_to(); default behavior defined out of line.
    virtual bool can_copy_from_different_type(const runtime_tensor_impl &src) const noexcept;
    virtual bool can_copy_to_different_type(const runtime_tensor_impl &dest) const noexcept;
    virtual result<void> copy_to_same_type(runtime_tensor_impl &dest) noexcept;
    virtual result<void> copy_from_different_type(runtime_tensor_impl &src) noexcept;
    virtual result<void> copy_to_different_type(runtime_tensor_impl &dest) noexcept;
    virtual result<void> copy_from_host(runtime_tensor_impl &src) noexcept;
    virtual result<void> copy_to_host(runtime_tensor_impl &dest) noexcept;
};
}
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,31 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_tensor_impl.h"
BEGIN_NS_NNCASE_RUNTIME
namespace detail
{
class host_runtime_tensor_impl;
}
END_NS_NNCASE_RUNTIME
// Platform hook: a port can substitute its own shared-tensor implementation
// header by defining NNCASE_SHARED_RUNTIME_TENSOR_PLATFORM_HEADER.
#ifndef NNCASE_SHARED_RUNTIME_TENSOR_PLATFORM_HEADER
#include "shared_runtime_tensor.platform.h"
#else
#include NNCASE_SHARED_RUNTIME_TENSOR_PLATFORM_HEADER
#endif

View File

@ -0,0 +1,44 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_tensor_impl.h"
BEGIN_NS_NNCASE_RUNTIME
namespace detail
{
struct host_memory_block;
// Physically addressed backing of a host memory block (for device-shared
// tensors). Move-only; `owned` marks whether free() should release the
// allocation (destructor behavior defined by the platform impl -- confirm).
struct NNCASE_API physical_memory_block
{
    uintptr_t physical_address;
    bool owned;

    physical_memory_block() noexcept;
    ~physical_memory_block();
    physical_memory_block(const physical_memory_block &) = delete;
    physical_memory_block(physical_memory_block &&other) noexcept;
    physical_memory_block &operator=(const physical_memory_block &) = delete;
    physical_memory_block &operator=(physical_memory_block &&other) noexcept;

    result<void> free() noexcept;

    // Platform hooks tying a host block to physical memory.
    static result<void> acknowledge(host_memory_block &block) noexcept;
    static result<void> allocate(host_memory_block &block) noexcept;
    static result<void> sync(host_memory_block &block, host_runtime_tensor::sync_op_t op) noexcept;
};
}
END_NS_NNCASE_RUNTIME

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,142 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <cstring>
#include <gsl/gsl-lite.hpp>
BEGIN_NS_NNCASE_RUNTIME
// Forward-only cursor over a read-only byte span. No explicit bounds checks:
// callers must ensure enough bytes remain (overrunning subspan is a gsl
// precondition violation).
class span_reader
{
public:
    span_reader(gsl::span<const gsl::byte> span)
        : span_(span)
    {
    }

    // True when no bytes remain.
    bool empty() const noexcept { return span_.empty(); }
    // Remaining byte count.
    size_t avail() const noexcept { return span_.size_bytes(); }

    // Read one T by direct dereference and advance. NOTE(review): requires
    // the cursor to be suitably aligned for T; use read_unaligned otherwise.
    template <class T>
    T read()
    {
        auto value = *reinterpret_cast<const T *>(span_.data());
        advance(sizeof(T));
        return value;
    }

    // Read one T via memcpy (no alignment requirement) and advance.
    template <class T>
    T read_unaligned()
    {
        alignas(T) uint8_t storage[sizeof(T)];
        std::memcpy(storage, span_.data(), sizeof(T));
        advance(sizeof(T));
        return *reinterpret_cast<const T *>(storage);
    }

    // Read one T into an out parameter and advance (aligned access).
    template <class T>
    void read(T &value)
    {
        value = *reinterpret_cast<const T *>(span_.data());
        advance(sizeof(T));
    }

    // View the next `size` Ts in place (no copy) and advance past them.
    template <class T>
    void read_span(gsl::span<const T> &span, size_t size)
    {
        span = { reinterpret_cast<const T *>(span_.data()), size };
        advance(sizeof(T) * size);
    }

    template <class T = gsl::byte>
    gsl::span<const T> read_span(size_t size)
    {
        gsl::span<const T> span(reinterpret_cast<const T *>(span_.data()), size);
        advance(sizeof(T) * size);
        return span;
    }

    // Consume and return everything that remains.
    void read_avail(gsl::span<const gsl::byte> &span)
    {
        span = span_;
        span_ = {};
    }

    gsl::span<const gsl::byte> read_avail()
    {
        auto span = span_;
        span_ = {};
        return span;
    }

    // Look at the remaining bytes without consuming them.
    gsl::span<const gsl::byte> peek_avail()
    {
        return span_;
    }

    // Read one T without advancing (aligned access).
    template <class T>
    T peek()
    {
        auto value = *reinterpret_cast<const T *>(span_.data());
        return value;
    }

    // Read one T without advancing (unaligned-safe).
    template <class T>
    T peek_unaligned()
    {
        T value;
        std::memcpy(&value, span_.data(), sizeof(T));
        return value;
    }

    // Read one T at `offset` bytes ahead without advancing (unaligned-safe).
    template <class T>
    T peek_unaligned_with_offset(size_t offset)
    {
        T value;
        std::memcpy(&value, span_.data() + offset, sizeof(T));
        return value;
    }

    // Return a pointer to the next T in place and advance past it.
    template <class T>
    const T *get_ref()
    {
        auto ptr = reinterpret_cast<const T *>(span_.data());
        advance(sizeof(T));
        return ptr;
    }

    template <class T>
    void get_ref(const T *&ptr)
    {
        ptr = get_ref<T>();
    }

    // Skip `count` bytes.
    void skip(size_t count)
    {
        advance(count);
    }

private:
    void advance(size_t count)
    {
        span_ = span_.subspan(count);
    }

private:
    gsl::span<const gsl::byte> span_;
};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,25 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_RT_STACKVM
// Kernel-execution context specific to the stack VM runtime module.
struct NNCASE_API stackvm_kernel_context : public kernels::kernel_context
{
    // NOTE(review): presumably the number of worker threads available to
    // parallelized kernels; defaults to 4. Confirm against the stackvm
    // kernel implementations before relying on this.
    int num_threads_ = 4;
};
END_NS_NNCASE_RT_STACKVM

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "../runtime_module.h"
BEGIN_NS_NNCASE_RT_STACKVM
NNCASE_INLINE_VAR constexpr module_type_t stackvm_module_type = to_module_type("stackvm");
NNCASE_API result<std::unique_ptr<runtime_module>> create_stackvm_runtime_module();
END_NS_NNCASE_RT_STACKVM

View File

@ -0,0 +1,17 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// nncase runtime version string.
// NOTE(review): likely generated from a version template at build time —
// confirm before editing by hand.
#define NNCASE_VERSION "1.0.0"
// `git describe` of the source tree this runtime was built from
// ("-dirty" = uncommitted local changes were present).
#define NNCASE_GIT_DESC "15b0a90-dirty"

View File

@ -0,0 +1 @@
include(${CMAKE_CURRENT_LIST_DIR}/nncase_rt_modules_k210Targets.cmake)

Some files were not shown because too many files have changed in this diff Show More