Update nncaseruntime to v1.0.0beta2

2021-09-17 10:59:50 +08:00 · 2021-09-17 10:59:50 +08:00 · cd966ff9bc
parent 9b61893150
commit cd966ff9bc
33 changed files with 1058 additions and 134 deletions
--- a/lib/nncase/v1/include/nncase/kernels/cpu/optimized/tensor_compute.h
+++ b/lib/nncase/v1/include/nncase/kernels/cpu/optimized/tensor_compute.h
@ -34,6 +34,15 @@ NNCASE_API result<void> copy(datatype_t type, const gsl::byte *src, gsl::byte *d
    const runtime_shape_t &shape, const runtime_shape_t &src_strides, const runtime_shape_t &dest_strides,
    int dims_offset, copy_impl_select impl_select, kernel_context &context) noexcept;

+// Declares the optimized-CPU conv2d kernel for float buffers.
+// Reads `input`, `weights` and `bias`, writes `output`; tensor layout is
+// described by the accompanying shape/stride vectors. `padding_h`/`padding_w`,
+// `stride_*`, `dilation_*` and `groups` are the usual convolution parameters.
+// NOTE(review): `fused_activation` presumably clamps each output element to
+// the given range - confirm against the implementation, which is not in this header.
+// Never throws; failure is reported through the returned result<void>.
+NNCASE_API result<void> conv2d(const float *input, const float *weights, const float *bias, float *output,
+    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &w_shape, const runtime_shape_t &w_strides,
+    const runtime_shape_t &bias_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
+    int32_t groups, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context) noexcept;
+
+// Declares the optimized-CPU dequantize kernel: converts the elements of
+// `input` (element type `in_type`) into `output` (element type `out_type`)
+// using `scale` and `bias`.
+// NOTE(review): the exact formula combining scale and bias is defined by the
+// implementation, which is not visible in this header - confirm before relying on it.
+// Never throws; failure is reported through the returned result<void>.
+NNCASE_API result<void> dequantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
+    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
+    kernel_context &context) noexcept;
+
 NNCASE_API result<void> gather(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, const runtime_shape_t &out_shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const int32_t *indices, const runtime_shape_t &indices_shape, size_t axis,
    kernel_context &context = default_kernel_context()) noexcept;
@ -45,6 +54,10 @@ NNCASE_API result<void> gather_nd(datatype_t type, const gsl::byte *input, gsl::
 NNCASE_API result<void> onehot(datatype_t type, const int32_t *indices, gsl::byte *output, const runtime_shape_t &indices_shape, const runtime_shape_t &out_shape,
    const runtime_shape_t &out_strides, gsl::byte *depth, gsl::byte *off_value, gsl::byte *on_value, size_t axis, onehot_mode_t mode, kernel_context &context) noexcept;

+// Declares the optimized-CPU quantize kernel: converts the elements of
+// `input` (element type `in_type`) into `output` (element type `out_type`)
+// using `scale` and `bias`.
+// NOTE(review): the exact formula combining scale and bias is defined by the
+// implementation, which is not visible in this header - confirm before relying on it.
+// Never throws; failure is reported through the returned result<void>.
+NNCASE_API result<void> quantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
+    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
+    kernel_context &context) noexcept;
+
 NNCASE_API result<void> slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &begins, const runtime_shape_t &ends, const runtime_axis_t &strides,
    kernel_context &context = default_kernel_context()) noexcept;
--- a/lib/nncase/v1/include/nncase/kernels/cpu/reference/runtime_types.h
+++ b/lib/nncase/v1/include/nncase/kernels/cpu/reference/runtime_types.h
@ -37,8 +37,8 @@ BEGIN_NS_NNCASE_KERNELS_CPU_REF

 namespace detail
 {
-template <class Callable>
-result<void> apply_impl(Callable &&callable, runtime_shape_t index_prefix, runtime_shape_t::const_iterator index_begin, runtime_shape_t::const_iterator index_end) noexcept
+template <class TShape, class Callable, class TIt>
+result<void> apply_impl(Callable &&callable, TShape index_prefix, TIt index_begin, TIt index_end) noexcept
 {
    const auto head = *index_begin++;
    index_prefix.push_back(0);
@ -63,10 +63,10 @@ result<void> apply_impl(Callable &&callable, runtime_shape_t index_prefix, runti
 }
 }

-template <class Callable>
-result<void> apply(const runtime_shape_t &shape, Callable &&callable) noexcept
+template <class TShape, class Callable>
+result<void> apply(const TShape &shape, Callable &&callable) noexcept
 {
-    return detail::apply_impl(std::forward<Callable>(callable), runtime_shape_t(), shape.cbegin(), shape.cend());
+    return detail::apply_impl(std::forward<Callable>(callable), TShape(), shape.cbegin(), shape.cend());
 }

 END_NS_NNCASE_KERNELS_CPU_REF
--- a/lib/nncase/v1/include/nncase/kernels/k210/k210_kernels.h
+++ b/lib/nncase/v1/include/nncase/kernels/k210/k210_kernels.h
@ -152,9 +152,7 @@ void kpu_conv2d(const uint8_t *input, int64_t *workspace, uint8_t *output, const
            for (size_t i = 0; i < channel_size; i++)
            {
                auto value = (*src_it++ * bn.mul >> bn.shift) + bn.add;
-                auto &seg = *std::find_if(activation.rbegin(), activation.rend(), [value](const runtime::k210::kpu_activation_segment &seg) {
-                    return value > seg.start_x;
-                });
+                auto &seg = *std::find_if(activation.rbegin(), activation.rend(), [value](const runtime::k210::kpu_activation_segment &seg) { return value > seg.start_x; });
                auto act_value = runtime::carry_shift<int64_t, true>((value - seg.start_x) * seg.mul, seg.shift) + seg.add;
                *out_it++ = (uint8_t)kernels::detail::clamp(act_value, int64_t(0), int64_t(255));
            }
--- a/lib/nncase/v1/include/nncase/kernels/kernel_utils.h
+++ b/lib/nncase/v1/include/nncase/kernels/kernel_utils.h
@ -53,7 +53,7 @@ template <class TShape>
 size_t offset(const TShape &strides, const TShape &index)
 {
    assert(strides.size() == index.size());
-    return element_offset<size_t>(strides, index.begin(), index.end());
+    return kernels::element_offset<size_t>(strides, index.begin(), index.end());
 }

 template <class TShape>
--- a/lib/nncase/v1/include/nncase/runtime/bfloat16.h
+++ b/lib/nncase/v1/include/nncase/runtime/bfloat16.h
@ -22,11 +22,6 @@

 namespace nncase
 {
-struct half
-{
-    uint16_t value;
-};
-
 struct from_raw_t
 {
    explicit from_raw_t() = default;
@ -165,7 +160,17 @@ public:

    static constexpr bfloat16 nan() noexcept
    {
-        return from_raw(NAN_VALUE);
+        return from_raw(0x7fc0);
+    }
+
+    // Quiet NaN: raw pattern 0x7fc0 (sign clear, all exponent bits set,
+    // most significant mantissa bit set).
+    static constexpr bfloat16 quiet_NaN() noexcept
+    {
+        return from_raw(0x7fc0);
+    }
+
+    // Signaling NaN: raw pattern 0x7f81 (sign clear, all exponent bits set,
+    // most significant mantissa bit clear, mantissa non-zero).
+    static constexpr bfloat16 signaling_NaN() noexcept
+    {
+        return from_raw(0x7f81);
    }

    static constexpr bfloat16 infinity() noexcept
@ -291,12 +296,12 @@ struct numeric_limits<nncase::bfloat16>

    NNCASE_UNUSED static constexpr nncase::bfloat16 quiet_NaN() noexcept
    {
-        return nncase::bfloat16::nan();
+        return nncase::bfloat16::quiet_NaN();
    }

    NNCASE_UNUSED static constexpr nncase::bfloat16 signaling_NaN() noexcept
    {
-        return nncase::bfloat16::nan();
+        return nncase::bfloat16::signaling_NaN();
    }

    static constexpr int digits = 8;
--- a/lib/nncase/v1/include/nncase/runtime/compiler_defs.h
+++ b/lib/nncase/v1/include/nncase/runtime/compiler_defs.h
@ -25,7 +25,7 @@
 #define NNCASE_API
 #endif
 #else
-#define NNCASE_API
+#define NNCASE_API __attribute__((visibility("default")))
 #endif

 #if defined(_MSC_VER)
@ -71,16 +71,17 @@ using invoke_result_t = std::result_of_t<Callable(Args...)>;
    }                         \
    }

-#define BEGIN_NS_NNCASE_RT_STACKVM \
-    namespace nncase               \
-    {                              \
-        namespace runtime          \
-        {                          \
-            namespace stackvm      \
+#define BEGIN_NS_NNCASE_RT_MODULE(MODULE) \
+    namespace nncase                      \
+    {                                     \
+        namespace runtime                 \
+        {                                 \
+            namespace MODULE              \
            {
-#define END_NS_NNCASE_RT_STACKVM \
-    }                            \
-    }                            \
+
+#define END_NS_NNCASE_RT_MODULE \
+    }                           \
+    }                           \
    }

 #define BEGIN_NS_NNCASE_KERNELS \
--- a/lib/nncase/v1/include/nncase/runtime/datatypes.h
+++ b/lib/nncase/v1/include/nncase/runtime/datatypes.h
@ -15,6 +15,7 @@
 #pragma once
 #include "bfloat16.h"
 #include "compiler_defs.h"
+#include "half.h"
 #include "small_vector.hpp"
 #include <array>
 #include <cmath>
@ -114,7 +115,7 @@ struct value_range

    static constexpr value_range<T> full() noexcept
    {
-        if (std::is_floating_point<T>::value || std::is_same<T, bfloat16>::value)
+        if (std::is_floating_point<T>::value || std::is_same<T, bfloat16>::value || std::is_same<T, half>::value)
            return { -std::numeric_limits<T>::infinity(), std::numeric_limits<T>::infinity() };
        else
            return { std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max() };
@ -324,6 +325,8 @@ NNCASE_INLINE_VAR constexpr memory_location_t mem_input = 0;
 NNCASE_INLINE_VAR constexpr memory_location_t mem_output = 1;
 NNCASE_INLINE_VAR constexpr memory_location_t mem_rdata = 2;
 NNCASE_INLINE_VAR constexpr memory_location_t mem_data = 3;
+NNCASE_INLINE_VAR constexpr memory_location_t mem_shared_data = 4;
+NNCASE_INLINE_VAR constexpr memory_location_t mem_private_base = 64;

 using runtime_shape_t = itlib::small_vector<size_t, 4>;
 using runtime_axis_t = itlib::small_vector<int32_t, 4>;
@ -378,6 +381,12 @@ struct scalar
        as<bfloat16>() = value;
    }

+    // Builds a scalar holding a half-precision value: tags it as dt_float16
+    // and writes `value` into the scalar's storage through as<half>().
+    scalar(half value) noexcept
+    {
+        type = dt_float16;
+        as<half>() = value;
+    }
+
    scalar(float value) noexcept
    {
        type = dt_float32;
@ -395,7 +404,7 @@ struct memory_range
 {
    memory_location_t memory_location;
    datatype_t datatype;
-    uint16_t reserved0;
+    uint16_t shared_module;
    uint32_t start;
    uint32_t size;
 };
@ -456,3 +465,16 @@ inline bool operator!=(const scalar &lhs, const scalar &rhs) noexcept
    return lhs.type != rhs.type || memcmp(&lhs.storage, &rhs.storage, valid_bytes);
 }
 }
+
+// Hash support for nncase::module_type_t so it can key unordered containers.
+// Folds every element of the key into a running value using a base-31
+// polynomial: digest = digest * 31 + element.
+template <>
+struct std::hash<nncase::module_type_t>
+{
+    auto operator()(const nncase::module_type_t &key) const noexcept
+    {
+        const size_t multiplier = 31;
+        size_t digest = 0;
+        for (auto element : key)
+        {
+            digest = (digest * multiplier) + element;
+        }
+        return digest;
+    }
+};
--- a/lib/nncase/v1/include/nncase/runtime/dbg.h
+++ b/lib/nncase/v1/include/nncase/runtime/dbg.h
@ -172,7 +172,7 @@ struct type_tag
 {
 };

-template <int &... ExplicitArgumentBarrier, typename T>
+template <int &...ExplicitArgumentBarrier, typename T>
 std::string get_type_name(type_tag<T>)
 {
    namespace pf = pretty_function;
@ -772,7 +772,8 @@ inline bool pretty_print(std::ostream &stream,
    const std::variant<Ts...> &value)
 {
    stream << "{";
-    std::visit([&stream](auto &&arg) { pretty_print(stream, arg); }, value);
+    std::visit([&stream](auto &&arg) { pretty_print(stream, arg); },
+        value);
    stream << "}";

    return true;
@ -849,7 +850,7 @@ public:
    template <typename... T>
    auto print(std::initializer_list<expr_t> exprs,
        std::initializer_list<std::string> types,
-        T &&... values) -> last_t<T...>
+        T &&...values) -> last_t<T...>
    {
        if (exprs.size() != sizeof...(values))
        {
@ -861,6 +862,15 @@ public:
        return print_impl(exprs.begin(), types.begin(), std::forward<T>(values)...);
    }

+    // Writes `message` to std::cerr, prefixed by this object's location and
+    // wrapped in the warning / reset ANSI sequences, then ends the line and
+    // flushes the stream.
+    template <typename T>
+    void print_err(T &&message)
+    {
+        auto &err_stream = std::cerr;
+        err_stream << m_location << ansi(ANSI_WARN);
+        err_stream << message;
+        err_stream << ansi(ANSI_RESET) << std::endl;
+    }
+
    template <typename T>
    T &&checked_print(std::initializer_list<expr_t> exprs,
        std::initializer_list<std::string> types,
@ -912,7 +922,7 @@ private:
    auto print_impl(const expr_t *exprs,
        const std::string *types,
        T &&value,
-        U &&... rest) -> last_t<T, U...>
+        U &&...rest) -> last_t<T, U...>
    {
        print_impl(exprs, types, std::forward<T>(value));
        return print_impl(exprs + 1, types + 1, std::forward<U>(rest)...);
@ -954,7 +964,7 @@ T &&identity(T &&t)
 }

 template <typename T, typename... U>
-auto identity(T &&, U &&... u) -> last_t<U...>
+auto identity(T &&, U &&...u) -> last_t<U...>
 {
    return identity(std::forward<U>(u)...);
 }
@ -1026,6 +1036,48 @@ auto identity(T &&, U &&... u) -> last_t<U...>
    if (!CHECK(x))           \
    return nncase::err(e)

+// checked_try(x): evaluates the result-returning expression `x` once. When the
+// result is not ok, prints the error message through dbg::DebugOutput
+// (tagged with the current file, line and function) and returns
+// nncase::err(...) with that error from the enclosing function. On success
+// the value is discarded.
+// The internal temporary has a deliberately unusual name so that an
+// expression mentioning a caller variable (for example one called `v`) is
+// not captured by the macro's own local.
+#define checked_try(x)                                                   \
+    {                                                                    \
+        auto checked_result_ = (x);                                      \
+        if (!checked_result_.is_ok())                                    \
+        {                                                                \
+            dbg::DebugOutput(__FILE__, __LINE__, __func__)               \
+                .print_err(checked_result_.unwrap_err().message());      \
+            return nncase::err(std::move(checked_result_.unwrap_err())); \
+        }                                                                \
+    }
+
+// checked_try_var(name, x): declares `name` in the current scope with the ok
+// type of the result produced by `x`, then evaluates `x` once. On success the
+// unwrapped value is moved into `name`; on failure the error message is
+// printed through dbg::DebugOutput (tagged with the current file, line and
+// function) and nncase::err(...) with that error is returned from the
+// enclosing function.
+// `name` is default-constructed before assignment, so the ok type must be
+// default-constructible and move-assignable.
+// The internal temporary has a deliberately unusual name so that neither
+// `name` nor a variable used inside `x` can collide with it.
+#define checked_try_var(name, x)                                         \
+    typename decltype((x))::traits::ok_type name;                        \
+    {                                                                    \
+        auto checked_result_ = (x);                                      \
+        if (checked_result_.is_ok())                                     \
+        {                                                                \
+            name = std::move(checked_result_.unwrap());                  \
+        }                                                                \
+        else                                                             \
+        {                                                                \
+            dbg::DebugOutput(__FILE__, __LINE__, __func__)               \
+                .print_err(checked_result_.unwrap_err().message());      \
+            return nncase::err(std::move(checked_result_.unwrap_err())); \
+        }                                                                \
+    }
+
+// checked_try_set(name, x): evaluates the result-returning expression `x`
+// once. On success the unwrapped value is moved into the already declared
+// lvalue `name`; on failure the error message is printed through
+// dbg::DebugOutput (tagged with the current file, line and function) and
+// nncase::err(...) with that error is returned from the enclosing function.
+// The internal temporary has a deliberately unusual name so that neither
+// `name` nor a variable used inside `x` can collide with it.
+#define checked_try_set(name, x)                                         \
+    {                                                                    \
+        auto checked_result_ = (x);                                      \
+        if (checked_result_.is_ok())                                     \
+        {                                                                \
+            name = std::move(checked_result_.unwrap());                  \
+        }                                                                \
+        else                                                             \
+        {                                                                \
+            dbg::DebugOutput(__FILE__, __LINE__, __func__)               \
+                .print_err(checked_result_.unwrap_err().message());      \
+            return nncase::err(std::move(checked_result_.unwrap_err())); \
+        }                                                                \
+    }
+
 #define dbg(...)                                        \
    dbg::DebugOutput(__FILE__, __LINE__, __func__)      \
        .print({ DBG_MAP(DBG_STRINGIFY, __VA_ARGS__) }, \
--- a/lib/nncase/v1/include/nncase/runtime/half.h
+++ b/lib/nncase/v1/include/nncase/runtime/half.h
@ -0,0 +1,369 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <cmath>
+#include <cstdint>
+#include <float.h>
+#include <functional>
+#include <limits>
+#include <nncase/runtime/compiler_defs.h>
+
+namespace nncase
+{
+// Tag type selecting the half constructor that takes a raw 16-bit pattern
+// rather than a numeric value to convert.
+struct fp16_from_raw_t
+{
+    explicit fp16_from_raw_t() = default;
+};
+
+// Tag instance to pass as the first constructor argument: half(fp16_from_raw, bits).
+NNCASE_INLINE_VAR constexpr fp16_from_raw_t fp16_from_raw {};
+
+/**
+ * IEEE 754 binary16 ("half precision") number stored as its raw 16-bit pattern.
+ *
+ * Bit layout assumed by every routine in this type: bit 15 is the sign,
+ * bits 14..10 are the exponent (mask 0x7c00) and bits 9..0 are the mantissa.
+ * Conversion to float is done by operator float(), conversion from float by
+ * round_to_half().
+ */
+struct half
+{
+private:
+    // Overlay used to view the bits of a float as a 32-bit integer and back.
+    union fp32
+    {
+        uint32_t u32;
+        float f32;
+
+        // Returns one 16-bit word of u32's storage: word 1 when
+        // NNCASE_LITTLE_ENDIAN is non-zero, word 0 otherwise.
+        uint16_t u16() const noexcept
+        {
+            constexpr size_t index = NNCASE_LITTLE_ENDIAN ? 1 : 0;
+            return reinterpret_cast<const uint16_t *>(&u32)[index];
+        }
+
+        // Mutable counterpart of the accessor above.
+        uint16_t &u16() noexcept
+        {
+            constexpr size_t index = NNCASE_LITTLE_ENDIAN ? 1 : 0;
+            return reinterpret_cast<uint16_t *>(&u32)[index];
+        }
+    };
+
+    // Magnitude bits of zero (the sign bit is masked off before comparing).
+    static constexpr uint16_t ZERO_VALUE = 0;
+
+    // Quiet NaN bit pattern; a signaling NaN is only produced on request
+    // through signaling_NaN().
+    static constexpr uint16_t NAN_VALUE = 0x7e00;
+
+public:
+    // Leaves the stored bits uninitialized.
+    half() noexcept = default;
+
+    // Converts a float using round_to_half().
+    explicit half(float v) noexcept
+        : value_(round_to_half(v).value_) { }
+
+    // Converts any other arithmetic type by first casting it to float.
+    template <class T, class = std::enable_if_t<std::is_integral<T>::value || std::is_floating_point<T>::value>>
+    explicit half(const T &val) noexcept
+        : half(static_cast<float>(val)) { }
+
+    // Adopts `value` as the raw bit pattern without any conversion.
+    constexpr half(fp16_from_raw_t, uint16_t value) noexcept
+        : value_(value) { }
+
+    // Widens the stored pattern to float by moving exponent and mantissa into
+    // float position, re-biasing the exponent and patching up the special
+    // cases (Inf/NaN and zero/subnormal).
+    operator float() const noexcept
+    {
+        const fp32 magic = { 113 << 23 };
+        const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+        fp32 o;
+
+        o.u32 = (value_ & 0x7fff) << 13; // exponent/mantissa bits
+        unsigned int exp = shifted_exp & o.u32; // just the exponent
+        o.u32 += (127 - 15) << 23; // exponent adjust
+
+        // handle exponent special cases
+        if (exp == shifted_exp)
+        { // Inf/NaN?
+            o.u32 += (128 - 16) << 23; // extra exp adjust
+        }
+        else if (exp == 0)
+        { // Zero/Denormal?
+            o.u32 += 1 << 23; // extra exp adjust
+            o.f32 -= magic.f32; // renormalize
+        }
+
+        o.u32 |= (value_ & 0x8000) << 16; // sign bit
+        return o.f32;
+    }
+
+    // Direct access to the stored 16-bit pattern.
+    const uint16_t &raw() const noexcept { return value_; }
+    uint16_t &raw() noexcept { return value_; }
+
+    // Builds a half whose bit pattern is exactly `v`.
+    static constexpr half from_raw(uint16_t v) noexcept
+    {
+        return half(nncase::fp16_from_raw, v);
+    }
+
+    // Converts a float to half. Magnitudes at or above 2^16 become infinity,
+    // NaN inputs become the quiet NaN 0x7e00, and the sign of the input is
+    // always carried over. The normal path rounds to nearest, ties to even;
+    // the subnormal path relies on float addition rounding the same way.
+    static half round_to_half(float v) noexcept
+    {
+        fp32 f;
+        f.f32 = v;
+        const fp32 f32infy = { 255 << 23 };
+        const fp32 f16max = { (127 + 16) << 23 };
+        const fp32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+        unsigned int sign_mask = 0x80000000u;
+
+        unsigned int sign = f.u32 & sign_mask;
+        f.u32 ^= sign;
+
+        // NOTE all the integer compares in this function can be safely
+        // compiled into signed compares since all operands are below
+        // 0x80000000. Important if you want fast straight SSE2 code
+        // (since there's no unsigned PCMPGTD).
+        half o;
+        if (f.u32 >= f16max.u32) // result is Inf or NaN (all exponent bits set)
+        {
+            o.value_ = (f.u32 > f32infy.u32) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+        }
+        else
+        {
+            if (f.u32 < (113 << 23))
+            { // resulting FP16 is subnormal or zero
+                // use a magic value to align our 10 mantissa bits at the bottom of
+                // the float. as long as FP addition is round-to-nearest-even this
+                // just works.
+                f.f32 += denorm_magic.f32;
+
+                // and one integer subtract of the bias later, we have our final float!
+                o.value_ = static_cast<uint16_t>(f.u32 - denorm_magic.u32);
+            }
+            else
+            {
+                unsigned int mant_odd = (f.u32 >> 13) & 1; // resulting mantissa is odd
+
+                // update exponent, rounding bias part 1
+                // Equivalent to `f.u32 += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
+                // without arithmetic overflow.
+                f.u32 += 0xc8000fffU;
+                // rounding bias part 2
+                f.u32 += mant_odd;
+                // take the bits!
+                o.value_ = static_cast<uint16_t>(f.u32 >> 13);
+            }
+        }
+        o.value_ |= static_cast<uint16_t>(sign >> 16);
+        return o;
+    }
+
+    // Machine epsilon: the gap between 1.0 (0x3c00) and the next representable
+    // value, i.e. 2^-10, which is exponent field 5 with a zero mantissa.
+    static constexpr half epsilon() noexcept
+    {
+        return from_raw(0x1400);
+    }
+
+    // Largest finite value (65504).
+    static constexpr half highest() noexcept
+    {
+        return from_raw(0x7bff);
+    }
+
+    // Smallest positive normal value (2^-14).
+    static constexpr half min() noexcept
+    {
+        return from_raw(0x0400);
+    }
+
+    // Most negative finite value (-65504).
+    static constexpr half lowest() noexcept
+    {
+        return from_raw(0xfbff);
+    }
+
+    // Quiet NaN: all exponent bits set, most significant mantissa bit set.
+    static constexpr half quiet_NaN() noexcept
+    {
+        return from_raw(0x7e00);
+    }
+
+    // Signaling NaN: all exponent bits set, most significant mantissa bit
+    // clear, mantissa non-zero.
+    static constexpr half signaling_NaN() noexcept
+    {
+        return from_raw(0x7d00);
+    }
+
+    // Positive infinity: all exponent bits set, zero mantissa.
+    static constexpr half infinity() noexcept
+    {
+        return from_raw(0x7c00);
+    }
+
+    // True for both +0 and -0 (the sign bit is ignored).
+    constexpr bool zero() const noexcept { return (value_ & 0x7FFF) == ZERO_VALUE; }
+
+private:
+    uint16_t value_;
+};
+
+// Generates a binary arithmetic operator for half: both operands are widened
+// to float, the operation is carried out in float, and the result is rounded
+// back with half::round_to_half().
+#define DEFINE_FP16_BINARY_FP16RET(x)                    \
+    inline half operator x(half a, half b) noexcept      \
+    {                                                    \
+        return half::round_to_half(float(a) x float(b)); \
+    }
+
+// Generates a relational operator for half that compares the float values of
+// both operands.
+#define DEFINE_FP16_BINARY_BOOLRET(x)               \
+    inline bool operator x(half a, half b) noexcept \
+    {                                               \
+        return float(a) x float(b);                 \
+    }
+
+DEFINE_FP16_BINARY_FP16RET(+)
+DEFINE_FP16_BINARY_FP16RET(-)
+DEFINE_FP16_BINARY_FP16RET(*)
+DEFINE_FP16_BINARY_FP16RET(/)
+DEFINE_FP16_BINARY_BOOLRET(<)
+DEFINE_FP16_BINARY_BOOLRET(<=)
+DEFINE_FP16_BINARY_BOOLRET(>=)
+DEFINE_FP16_BINARY_BOOLRET(>)
+
+// Generates a compound-assignment operator `x` for half in terms of the
+// matching binary operator `op`: a = a op b, returning a reference to a.
+#define DEFINE_FP16_BINARY_SELF_MOD(x, op)            \
+    inline half &operator x(half &a, half b) noexcept \
+    {                                                 \
+        a = a op b;                                   \
+        return a;                                     \
+    }
+
+DEFINE_FP16_BINARY_SELF_MOD(+=, +)
+DEFINE_FP16_BINARY_SELF_MOD(-=, -)
+DEFINE_FP16_BINARY_SELF_MOD(*=, *)
+DEFINE_FP16_BINARY_SELF_MOD(/=, /)
+
+// Unary minus: negates the float value of `a` and rounds it back to half.
+inline half operator-(half a) noexcept
+{
+    const float negated = -float(a);
+    return half::round_to_half(negated);
+}
+
+// Equality on the raw 16-bit patterns, not on numeric value: two NaNs with
+// identical bits compare equal, while +0 (0x0000) and -0 (0x8000) compare
+// unequal.
+inline bool operator==(const half &lhs, const half &rhs) noexcept
+{
+    return lhs.raw() == rhs.raw();
+}
+
+// Inequality on the raw 16-bit patterns, not on numeric value: +0 (0x0000)
+// and -0 (0x8000) are reported as different, and a NaN is not different from
+// another NaN with the same bits.
+inline bool operator!=(const half &lhs, const half &rhs) noexcept
+{
+    return lhs.raw() != rhs.raw();
+}
+}
+
+namespace std
+{
+// Hashes a half by widening it to float and delegating to hash<float>.
+template <>
+struct hash<nncase::half>
+{
+    size_t operator()(const nncase::half &v) const
+    {
+        const float widened = static_cast<float>(v);
+        return hash<float>()(widened);
+    }
+};
+
+// std::numeric_limits for nncase::half (IEEE 754 binary16: 11 significand
+// digits including the implicit bit, 5 exponent bits).
+template <>
+struct numeric_limits<nncase::half>
+{
+    static constexpr float_denorm_style has_denorm = std::denorm_present;
+    static constexpr bool has_infinity = true;
+    static constexpr bool has_quiet_NaN = true;
+    static constexpr bool has_signaling_NaN = true;
+    // The set of representable values is finite, so the type is bounded.
+    static constexpr bool is_bounded = true;
+    static constexpr bool is_exact = false;
+    static constexpr bool is_iec559 = true;
+    static constexpr bool is_integer = false;
+    static constexpr bool is_modulo = false;
+    static constexpr bool is_signed = true;
+    static constexpr bool is_specialized = true;
+    static constexpr float_round_style round_style = std::round_to_nearest;
+    static constexpr int radix = FLT_RADIX;
+
+    // Smallest positive normal value (2^-14).
+    NNCASE_UNUSED static constexpr nncase::half(min)() noexcept
+    {
+        return nncase::half::min();
+    }
+
+    // Largest finite value (65504).
+    NNCASE_UNUSED static constexpr nncase::half(max)() noexcept
+    {
+        return nncase::half::highest();
+    }
+
+    // Most negative finite value (-65504).
+    NNCASE_UNUSED static constexpr nncase::half lowest() noexcept
+    {
+        return nncase::half::lowest();
+    }
+
+    // Gap between 1.0 and the next representable value, as reported by half::epsilon().
+    NNCASE_UNUSED static constexpr nncase::half epsilon() noexcept
+    {
+        return nncase::half::epsilon();
+    }
+
+    // Maximum rounding error for round-to-nearest: 0.5, raw pattern 0x3800.
+    NNCASE_UNUSED static constexpr nncase::half round_error() noexcept
+    {
+        return nncase::half::from_raw(0x3800);
+    }
+
+    // Smallest positive subnormal value (2^-24): only the lowest mantissa bit set.
+    NNCASE_UNUSED static constexpr nncase::half denorm_min() noexcept
+    {
+        return nncase::half::from_raw(0x0001);
+    }
+
+    NNCASE_UNUSED static constexpr nncase::half infinity() noexcept
+    {
+        return nncase::half::infinity();
+    }
+
+    NNCASE_UNUSED static constexpr nncase::half quiet_NaN() noexcept
+    {
+        return nncase::half::quiet_NaN();
+    }
+
+    NNCASE_UNUSED static constexpr nncase::half signaling_NaN() noexcept
+    {
+        return nncase::half::signaling_NaN();
+    }
+
+    static constexpr int digits = 11;
+    // floor((digits - 1) * log10(2)) = 3 decimal digits survive a round trip.
+    static constexpr int digits10 = 3;
+    // ceil(1 + digits * log10(2)) = 5 decimal digits identify every value.
+    static constexpr int max_digits10 = 5;
+    static const int min_exponent = -13;
+    static const int min_exponent10 = -4;
+    static const int max_exponent = 16;
+    static const int max_exponent10 = 4;
+};
+
+// Math overloads for nncase::half. Each one widens its argument(s) to float
+// and calls the float routine; overloads returning half round the result back
+// with half::round_to_half(), while the classification helpers and lrint
+// return the float routine's result directly.
+using nncase::half;
+inline bool isinf(const half &a) { return std::isinf(float(a)); }
+inline bool isnan(const half &a) { return std::isnan(float(a)); }
+inline bool isfinite(const half &a) { return std::isfinite(float(a)); }
+inline half abs(const half &a) { return half::round_to_half(fabsf(float(a))); }
+inline half exp(const half &a) { return half::round_to_half(expf(float(a))); }
+inline half log(const half &a) { return half::round_to_half(logf(float(a))); }
+inline half log10(const half &a)
+{
+    return half::round_to_half(log10f(float(a)));
+}
+inline half sqrt(const half &a)
+{
+    return half::round_to_half(sqrtf(float(a)));
+}
+inline half pow(const half &a, const half &b)
+{
+    return half::round_to_half(powf(float(a), float(b)));
+}
+
+inline half sin(const half &a) { return half::round_to_half(sinf(float(a))); }
+inline half cos(const half &a) { return half::round_to_half(cosf(float(a))); }
+inline half tan(const half &a) { return half::round_to_half(tanf(float(a))); }
+inline half tanh(const half &a)
+{
+    return half::round_to_half(tanhf(float(a)));
+}
+inline half floor(const half &a)
+{
+    return half::round_to_half(floorf(float(a)));
+}
+inline half ceil(const half &a)
+{
+    return half::round_to_half(ceilf(float(a)));
+}
+inline half round(const half &a)
+{
+    return half::round_to_half(roundf(float(a)));
+}
+inline half nearbyint(const half &a)
+{
+    return half::round_to_half(nearbyintf(float(a)));
+}
+inline long lrint(const half &a)
+{
+    return lrintf(float(a));
+}
+}
--- a/lib/nncase/v1/include/nncase/runtime/incbin.h
+++ b/lib/nncase/v1/include/nncase/runtime/incbin.h
@ -0,0 +1,369 @@
+/**
+ * @file incbin.h
+ * @author Dale Weiler
+ * @brief Utility for including binary files
+ *
+ * Facilities for including binary files into the current translation unit and
+ * making use of them externally in other translation units.
+ */
+// clang-format off
+#ifndef INCBIN_HDR
+#define INCBIN_HDR
+#include <limits.h>
+/*
+ * Select INCBIN_ALIGNMENT_INDEX, the log2 of the default data alignment, from
+ * the widest SIMD extension the compiler advertises: 6 for AVX-512, 5 for
+ * AVX/AVX2, 4 for SSE variants or NEON. Without any of those it is 3 when
+ * ULONG_MAX is not 0xffffffff and 2 otherwise.
+ */
+#if   defined(__AVX512BW__) || \
+      defined(__AVX512CD__) || \
+      defined(__AVX512DQ__) || \
+      defined(__AVX512ER__) || \
+      defined(__AVX512PF__) || \
+      defined(__AVX512VL__) || \
+      defined(__AVX512F__)
+# define INCBIN_ALIGNMENT_INDEX 6
+#elif defined(__AVX__)      || \
+      defined(__AVX2__)
+# define INCBIN_ALIGNMENT_INDEX 5
+#elif defined(__SSE__)      || \
+      defined(__SSE2__)     || \
+      defined(__SSE3__)     || \
+      defined(__SSSE3__)    || \
+      defined(__SSE4_1__)   || \
+      defined(__SSE4_2__)   || \
+      defined(__neon__)
+# define INCBIN_ALIGNMENT_INDEX 4
+#elif ULONG_MAX != 0xffffffffu
+# define INCBIN_ALIGNMENT_INDEX 3
+# else
+# define INCBIN_ALIGNMENT_INDEX 2
+#endif
+
+/* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX' */
+#define INCBIN_ALIGN_SHIFT_0 1
+#define INCBIN_ALIGN_SHIFT_1 2
+#define INCBIN_ALIGN_SHIFT_2 4
+#define INCBIN_ALIGN_SHIFT_3 8
+#define INCBIN_ALIGN_SHIFT_4 16
+#define INCBIN_ALIGN_SHIFT_5 32
+#define INCBIN_ALIGN_SHIFT_6 64
+
+/* Actual alignment value */
+#define INCBIN_ALIGNMENT \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \
+        INCBIN_ALIGNMENT_INDEX)
+
+/* Stringize */
+#define INCBIN_STR(X) \
+    #X
+#define INCBIN_STRINGIZE(X) \
+    INCBIN_STR(X)
+/* Concatenate */
+#define INCBIN_CAT(X, Y) \
+    X ## Y
+#define INCBIN_CONCATENATE(X, Y) \
+    INCBIN_CAT(X, Y)
+/* Deferred macro expansion */
+#define INCBIN_EVAL(X) \
+    X
+#define INCBIN_INVOKE(N, ...) \
+    INCBIN_EVAL(N(__VA_ARGS__))
+
+/* Green Hills uses a different directive for including binary data */
+#if defined(__ghs__)
+#  if (__ghs_asm == 2)
+#    define INCBIN_MACRO ".file"
+/* Or consider the ".myrawdata" entry in the ld file */
+#  else
+#    define INCBIN_MACRO "\tINCBIN"
+#  endif
+#else
+#  define INCBIN_MACRO ".incbin"
+#endif
+
+#ifndef _MSC_VER
+#  define INCBIN_ALIGN \
+    __attribute__((aligned(INCBIN_ALIGNMENT)))
+#else
+#  define INCBIN_ALIGN __declspec(align(INCBIN_ALIGNMENT))
+#endif
+
+#if defined(__arm__) || /* GNU C and RealView */ \
+    defined(__arm) || /* Diab */ \
+    defined(_ARM) /* ImageCraft */
+#  define INCBIN_ARM
+#endif
+
+#ifdef __GNUC__
+/* Utilize .balign where supported */
+#  define INCBIN_ALIGN_HOST ".balign " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".balign 1\n"
+#elif defined(INCBIN_ARM)
+/*
+ * On arm assemblers, the alignment value is calculated as (1 << n) where `n' is
+ * the shift count. This is the value passed to `.align'
+ */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT_INDEX) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 0\n"
+#else
+/* We assume other inline assemblers treat `.align' as `.balign' */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 1\n"
+#endif
+
+/* INCBIN_CONST is used by incbin.c generated files */
+#if defined(__cplusplus)
+#  define INCBIN_EXTERNAL extern "C"
+#  define INCBIN_CONST    extern const
+#else
+#  define INCBIN_EXTERNAL extern
+#  define INCBIN_CONST    const
+#endif
+
+/**
+ * @brief Optionally override the linker section into which data is emitted.
+ *
+ * @warning If you use this facility, you'll have to deal with platform-specific linker output
+ * section naming on your own
+ *
+ * Overriding the default linker output section, e.g for esp8266/Arduino:
+ * @code
+ * #define INCBIN_OUTPUT_SECTION ".irom.text"
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ * // Data is emitted into program memory that never gets copied to RAM
+ * @endcode
+ */
+#if !defined(INCBIN_OUTPUT_SECTION)
+#  if defined(__APPLE__)
+#    define INCBIN_OUTPUT_SECTION         ".const_data"
+#  else
+#    define INCBIN_OUTPUT_SECTION         ".rodata"
+#  endif
+#endif
+
+#if defined(__APPLE__)
+/* The directives are different for Apple branded compilers */
+#  define INCBIN_SECTION         INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".globl " INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  define INCBIN_INT             ".long "
+#  define INCBIN_MANGLE          "_"
+#  define INCBIN_BYTE            ".byte "
+#  define INCBIN_TYPE(...)
+#else
+#  define INCBIN_SECTION         ".section " INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  if defined(__ghs__)
+#    define INCBIN_INT           ".word "
+#  else
+#    define INCBIN_INT           ".int "
+#  endif
+#  if defined(__USER_LABEL_PREFIX__)
+#    define INCBIN_MANGLE        INCBIN_STRINGIZE(__USER_LABEL_PREFIX__)
+#  else
+#    define INCBIN_MANGLE        ""
+#  endif
+#  if defined(INCBIN_ARM)
+/* On arm assemblers, `@' is used as a line comment token */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", %object\n"
+#  elif defined(__MINGW32__) || defined(__MINGW64__)
+/* Mingw doesn't support this directive either */
+#    define INCBIN_TYPE(NAME)
+#  else
+/* It's safe to use `@' on other architectures */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", @object\n"
+#  endif
+#  define INCBIN_BYTE            ".byte "
+#endif
+
+/* List of style types used for symbol names */
+#define INCBIN_STYLE_CAMEL 0
+#define INCBIN_STYLE_SNAKE 1
+
+/**
+ * @brief Specify the prefix to use for symbol names.
+ *
+ * By default this is `g', producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char gFooData[];
+ * // const unsigned char *const gFooEnd;
+ * // const unsigned int gFooSize;
+ * @endcode
+ *
+ * If however you specify a prefix before including: e.g:
+ * @code
+ * #define INCBIN_PREFIX incbin
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols instead:
+ * // const unsigned char incbinFooData[];
+ * // const unsigned char *const incbinFooEnd;
+ * // const unsigned int incbinFooSize;
+ * @endcode
+ */
+#if !defined(INCBIN_PREFIX)
+#  define INCBIN_PREFIX g
+#endif
+
+/**
+ * @brief Specify the style used for symbol names.
+ *
+ * Possible options are
+ * - INCBIN_STYLE_CAMEL "CamelCase"
+ * - INCBIN_STYLE_SNAKE "snake_case"
+ *
+ * Default option is *INCBIN_STYLE_CAMEL* producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>FooData[];
+ * // const unsigned char *const <prefix>FooEnd;
+ * // const unsigned int <prefix>FooSize;
+ * @endcode
+ *
+ * If however you specify a style before including: e.g:
+ * @code
+ * #define INCBIN_STYLE INCBIN_STYLE_SNAKE
+ * #include "incbin.h"
+ * INCBIN(foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>foo_data[];
+ * // const unsigned char *const <prefix>foo_end;
+ * // const unsigned int <prefix>foo_size;
+ * @endcode
+ */
+#if !defined(INCBIN_STYLE)
+#  define INCBIN_STYLE INCBIN_STYLE_CAMEL
+#endif
+
+/* Style lookup tables */
+#define INCBIN_STYLE_0_DATA Data
+#define INCBIN_STYLE_0_END End
+#define INCBIN_STYLE_0_SIZE Size
+#define INCBIN_STYLE_1_DATA _data
+#define INCBIN_STYLE_1_END _end
+#define INCBIN_STYLE_1_SIZE _size
+
+/* Style lookup: returning identifier */
+#define INCBIN_STYLE_IDENT(TYPE) \
+    INCBIN_CONCATENATE( \
+        INCBIN_STYLE_, \
+        INCBIN_CONCATENATE( \
+            INCBIN_EVAL(INCBIN_STYLE), \
+            INCBIN_CONCATENATE(_, TYPE)))
+
+/* Style lookup: returning string literal */
+#define INCBIN_STYLE_STRING(TYPE) \
+    INCBIN_STRINGIZE( \
+        INCBIN_STYLE_IDENT(TYPE)) \
+
+/* Generate the global labels by indirectly invoking the macro with our style
+ * type and concatenating the name against them. */
+#define INCBIN_GLOBAL_LABELS(NAME, TYPE) \
+    INCBIN_INVOKE( \
+        INCBIN_GLOBAL, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE))) \
+    INCBIN_INVOKE( \
+        INCBIN_TYPE, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE)))
+
+/**
+ * @brief Externally reference binary data included in another translation unit.
+ *
+ * Produces three external symbols that reference the binary data included in
+ * another translation unit.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name given for the binary data
+ *
+ * @code
+ * INCBIN_EXTERN(Foo);
+ *
+ * // Now you have the following symbols:
+ * // extern const unsigned char <prefix>FooData[];
+ * // extern const unsigned char *const <prefix>FooEnd;
+ * // extern const unsigned int <prefix>FooSize;
+ * @endcode
+ */
+#define INCBIN_EXTERN(NAME) \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(DATA))[]; \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char *const \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+        INCBIN_STYLE_IDENT(END)); \
+    INCBIN_EXTERNAL const unsigned int \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(SIZE))
+
+/**
+ * @brief Include a binary file into the current translation unit.
+ *
+ * Includes a binary file into the current translation unit, producing three symbols
+ * for objects that encode the data, the end of the data, and its size respectively.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name to associate with this binary data (as an identifier.)
+ * @param FILENAME The file to include (as a string literal; unused when _MSC_VER is defined.)
+ *
+ * @code
+ * INCBIN(Icon, "icon.png");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>IconData[];
+ * // const unsigned char *const <prefix>IconEnd;
+ * // const unsigned int <prefix>IconSize;
+ * @endcode
+ *
+ * @warning This must be used in global scope
+ * @warning The identifiers may be different if INCBIN_STYLE is not default
+ *
+ * To externally reference the data included by this in another translation unit
+ * please @see INCBIN_EXTERN.
+ */
+#ifdef _MSC_VER /* this branch only declares the three symbols via INCBIN_EXTERN; the data itself must be provided by other means */
+#define INCBIN(NAME, FILENAME) \
+    INCBIN_EXTERN(NAME)
+#else
+#define INCBIN(NAME, FILENAME) \
+    __asm__(INCBIN_SECTION \
+            INCBIN_GLOBAL_LABELS(NAME, DATA) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \
+            INCBIN_MACRO " \"" FILENAME "\"\n" \
+            INCBIN_GLOBAL_LABELS(NAME, END) \
+            INCBIN_ALIGN_BYTE \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \
+                INCBIN_BYTE "1\n" \
+            INCBIN_GLOBAL_LABELS(NAME, SIZE) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \
+                INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \
+                           INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \
+            INCBIN_ALIGN_HOST \
+            ".text\n" \
+    ); \
+    INCBIN_EXTERN(NAME)
+
+#endif
+#endif
--- a/lib/nncase/v1/include/nncase/runtime/interpreter.h
+++ b/lib/nncase/v1/include/nncase/runtime/interpreter.h
@ -74,7 +74,7 @@ public:

 private:
    std::vector<std::unique_ptr<runtime_module>> modules_;
-    runtime_module *main_module_;
+    runtime_function *entry_function_;
    options_dict options_;
 };

--- a/lib/nncase/v1/include/nncase/runtime/k210/compiler_defs.h
+++ b/lib/nncase/v1/include/nncase/runtime/k210/compiler_defs.h
@ -22,21 +22,9 @@
 #define NNCASE_MODULES_K210_API __declspec(dllimport)
 #endif
 #else
-#define NNCASE_MODULES_K210_API
+#define NNCASE_MODULES_K210_API __attribute__((visibility("default")))
 #endif

-#define BEGIN_NS_NNCASE_RT_K210 \
-    namespace nncase            \
-    {                           \
-    namespace runtime           \
-    {                           \
-        namespace k210          \
-        {
-#define END_NS_NNCASE_RT_K210 \
-    }                         \
-    }                         \
-    }
-
 #define BEGIN_NS_NNCASE_KERNELS_K210 \
    namespace nncase                 \
    {                                \
--- a/lib/nncase/v1/include/nncase/runtime/k210/error.h
+++ b/lib/nncase/v1/include/nncase/runtime/k210/error.h
@ -16,7 +16,7 @@
 #include "compiler_defs.h"
 #include <nncase/runtime/error.h>

-BEGIN_NS_NNCASE_RT_K210
+BEGIN_NS_NNCASE_RT_MODULE(k210)

 enum class nncase_k210_errc
 {
@ -26,7 +26,7 @@ enum class nncase_k210_errc
 NNCASE_MODULES_K210_API const std::error_category &nncase_k210_category() noexcept;
 NNCASE_MODULES_K210_API std::error_condition make_error_condition(nncase_k210_errc code);

-END_NS_NNCASE_RT_K210
+END_NS_NNCASE_RT_MODULE

 namespace std
 {
--- a/lib/nncase/v1/include/nncase/runtime/k210/op_reader.h
+++ b/lib/nncase/v1/include/nncase/runtime/k210/op_reader.h
@ -17,7 +17,7 @@
 #include <nncase/runtime/result.h>
 #include <nncase/runtime/span_reader.h>

-BEGIN_NS_NNCASE_RT_K210
+BEGIN_NS_NNCASE_RT_MODULE(k210)

 class NNCASE_MODULES_K210_API op_visitor
 {
@ -44,4 +44,4 @@ private:
    result<void> next() noexcept;
 };

-END_NS_NNCASE_RT_K210
+END_NS_NNCASE_RT_MODULE
--- a/lib/nncase/v1/include/nncase/runtime/k210/runtime_module.h
+++ b/lib/nncase/v1/include/nncase/runtime/k210/runtime_module.h
@ -16,10 +16,11 @@
 #include "compiler_defs.h"
 #include <nncase/runtime/runtime_module.h>

-BEGIN_NS_NNCASE_RT_K210
+BEGIN_NS_NNCASE_RT_MODULE(k210)

 NNCASE_INLINE_VAR constexpr module_type_t k210_module_type = to_module_type("k210");
+NNCASE_INLINE_VAR constexpr uint32_t k210_module_version = 1;

 NNCASE_MODULES_K210_API result<std::unique_ptr<runtime_module>> create_k210_runtime_module();

-END_NS_NNCASE_RT_K210
+END_NS_NNCASE_RT_MODULE
--- a/lib/nncase/v1/include/nncase/runtime/k210/runtime_op_utility.h
+++ b/lib/nncase/v1/include/nncase/runtime/k210/runtime_op_utility.h
@ -15,7 +15,7 @@
 #pragma once
 #include "runtime_types.h"

-BEGIN_NS_NNCASE_RT_K210
+BEGIN_NS_NNCASE_RT_MODULE(k210)

 struct kpu_layout
 {
@ -184,4 +184,4 @@ inline std::array<int32_t, 2> get_kpu_select_pool_offset(kpu_pool_type_t pool_ty
    }
 }

-END_NS_NNCASE_RT_K210
+END_NS_NNCASE_RT_MODULE
--- a/lib/nncase/v1/include/nncase/runtime/k210/runtime_types.h
+++ b/lib/nncase/v1/include/nncase/runtime/k210/runtime_types.h
@ -16,9 +16,9 @@
 #include "compiler_defs.h"
 #include <nncase/runtime/datatypes.h>

-BEGIN_NS_NNCASE_RT_K210
+BEGIN_NS_NNCASE_RT_MODULE(k210)

-NNCASE_INLINE_VAR constexpr memory_location_t mem_kpu = 4;
+NNCASE_INLINE_VAR constexpr memory_location_t mem_kpu = mem_private_base + 0;
 NNCASE_INLINE_VAR constexpr size_t KPU_RAM_SIZE = 2 * 1024 * 1024; // 2MB

 typedef struct
@ -341,4 +341,4 @@ struct copy_options
    kpu_shape_t out_strides;
 };

-END_NS_NNCASE_RT_K210
+END_NS_NNCASE_RT_MODULE
--- a/lib/nncase/v1/include/nncase/runtime/model.h
+++ b/lib/nncase/v1/include/nncase/runtime/model.h
@ -24,26 +24,49 @@ struct model_header
 {
    uint32_t identifier;
    uint32_t version;
+    uint32_t header_size;
    uint32_t flags;
    uint32_t alignment;
    uint32_t modules;
-    uint32_t main_module;
+    uint32_t entry_module;
+    uint32_t entry_function;
+};
+
+struct function_header
+{
+    uint32_t header_size;
+    uint32_t size;
+    uint32_t input_pool_size;
+    uint32_t output_pool_size;
+    uint32_t inputs;
+    uint32_t outputs;
+    uint32_t entrypoint;
+    uint32_t text_size;
 };

 struct module_header
 {
    module_type_t type;
+    uint32_t version;
+    uint32_t header_size;
    uint32_t size;
    uint32_t mempools;
-    uint32_t inputs;
-    uint32_t outputs;
+    uint32_t shared_mempools;
    uint32_t sections;
+    uint32_t functions;
    uint32_t reserved0;
 };

 struct mempool_desc
 {
    memory_location_t location;
+    uint8_t reserved0[3];
+    uint32_t size;
+};
+
+struct shared_mempool_desc
+{
+    uint32_t module;
    uint32_t size;
 };

@ -51,8 +74,8 @@ struct section_header
 {
    char name[MAX_SECTION_NAME_LENGTH];
    uint32_t flags;
-    uint32_t start;
-    uint32_t size;
+    uint32_t body_start;
+    uint32_t body_size;
    uint32_t reserved0;
 };

--- a/lib/nncase/v1/include/nncase/runtime/result.h
+++ b/lib/nncase/v1/include/nncase/runtime/result.h
@ -72,7 +72,7 @@ struct Ok
        : value(value) { }

    template <class... Args>
-    constexpr explicit Ok(mpark::in_place_t, Args &&... args)
+    constexpr explicit Ok(mpark::in_place_t, Args &&...args)
        : value(std::forward<Args>(args)...) { }

    T value;
@ -101,7 +101,7 @@ inline constexpr Ok<void> ok()
 }

 template <class T, class... Args>
-constexpr Ok<T> ok(Args &&... args)
+constexpr Ok<T> ok(Args &&...args)
 {
    return Ok<T>(mpark::in_place, std::forward<Args>(args)...);
 }
--- a/lib/nncase/v1/include/nncase/runtime/runtime_function.h
+++ b/lib/nncase/v1/include/nncase/runtime/runtime_function.h
@ -0,0 +1,86 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "model.h"
+#include "result.h"
+#include "runtime_tensor.h"
+
+BEGIN_NS_NNCASE_RUNTIME
+
+class interpreter;
+class runtime_module;
+struct runtime_module_init_context;
+
+struct NNCASE_API runtime_function_init_context // pure-virtual interface handed to runtime_function::initialize_core; NOTE(review): no virtual destructor is declared, so implementations must not be deleted through this type
+{
+    virtual runtime_module_init_context &module_init_context() noexcept = 0; // access to the module-level init context
+    virtual const function_header &header() noexcept = 0; // function_header as declared in model.h; presumably the header of the function being initialized - TODO confirm
+    virtual gsl::span<const gsl::byte> body() noexcept = 0; // read-only byte span; presumably the function body that follows the header - TODO confirm
+};
+
+class NNCASE_API runtime_function // abstract base class: derived classes implement the pure-virtual hooks in the protected section
+{
+private:
+    struct inout_tensor_info // bookkeeping record kept per input and per output (see input_tensors_ / output_tensors_)
+    {
+        runtime_shape_t shape;
+        runtime_shape_t strides;
+        memory_range range;
+        runtime_tensor bind_tensor; // NOTE(review): the roles of the bind/staging/device tensors are defined by the implementation, which is not visible in this header
+        runtime_tensor staging_tensor;
+        runtime_tensor device_tensor;
+    };
+
+public:
+    runtime_function(runtime_module &rt_module); // takes the owning module by reference (presumably bound to rt_module_)
+    runtime_function(const runtime_function &) = delete; // non-copyable
+    virtual ~runtime_function() = default; // virtual so that destruction through a base-class pointer is well-defined
+    runtime_function &operator=(const runtime_function &) = delete; // non-assignable
+
+    result<void> initialize(gsl::span<const gsl::byte> payload, runtime_module_init_context &module_init_context) noexcept; // non-virtual entry point; presumably forwards to initialize_core - TODO confirm
+    runtime_module &module() const noexcept; // the module this function was constructed with (presumably rt_module_)
+
+    uint32_t inputs_size() const noexcept; // presumably the number of inputs - TODO confirm against the implementation
+    const runtime_shape_t &input_shape(size_t index) const noexcept;
+    const memory_range &input_desc(size_t index) const noexcept;
+    result<runtime_tensor> input_tensor(size_t index) noexcept; // overload that returns the tensor for index
+    result<void> input_tensor(size_t index, runtime_tensor tensor) noexcept; // overload that accepts a tensor (by value) for index
+
+    uint32_t outputs_size() const noexcept; // presumably the number of outputs - TODO confirm against the implementation
+    const runtime_shape_t &output_shape(size_t index) const noexcept;
+    const memory_range &output_desc(size_t index) const noexcept;
+    result<runtime_tensor> output_tensor(size_t index) noexcept; // overload that returns the tensor for index
+    result<void> output_tensor(size_t index, runtime_tensor tensor) noexcept; // overload that accepts a tensor (by value) for index
+
+    result<void> invoke() noexcept; // non-virtual entry point; presumably dispatches to invoke_core - TODO confirm
+
+protected:
+    virtual result<void> initialize_core(runtime_function_init_context &context) noexcept = 0; // hook: must be implemented by derived classes
+    virtual result<runtime_tensor> allocate_input_tensor(size_t index) noexcept = 0; // hook: must be implemented by derived classes
+    virtual result<runtime_tensor> allocate_output_tensor(size_t index) noexcept = 0; // hook: must be implemented by derived classes
+    virtual result<void> validate_input_tensor(size_t index, runtime_tensor tensor) noexcept = 0; // hook: must be implemented by derived classes
+    virtual result<void> validate_output_tensor(size_t index, runtime_tensor tensor) noexcept = 0; // hook: must be implemented by derived classes
+    result<runtime_tensor> device_input_tensor(size_t index) noexcept; // non-virtual helper available to derived classes
+    result<runtime_tensor> device_output_tensor(size_t index) noexcept; // non-virtual helper available to derived classes
+    virtual result<void> invoke_core() noexcept = 0; // hook: must be implemented by derived classes
+
+private:
+    function_header header_;
+    std::vector<inout_tensor_info> input_tensors_;
+    std::vector<inout_tensor_info> output_tensors_;
+    runtime_module &rt_module_; // reference member: the referenced module must outlive this object
+};
+
+END_NS_NNCASE_RUNTIME
--- a/lib/nncase/v1/include/nncase/runtime/runtime_module.h
+++ b/lib/nncase/v1/include/nncase/runtime/runtime_module.h
@ -15,6 +15,7 @@
 #pragma once
 #include "model.h"
 #include "result.h"
+#include "runtime_function.h"
 #include "runtime_tensor.h"

 BEGIN_NS_NNCASE_RUNTIME
@ -31,26 +32,15 @@ struct NNCASE_API runtime_module_init_context

 class NNCASE_API runtime_module
 {
-private:
-    struct inout_tensor_info
-    {
-        runtime_shape_t shape;
-        runtime_shape_t strides;
-        memory_range range;
-        runtime_tensor bind_tensor;
-        runtime_tensor staging_tensor;
-        runtime_tensor device_tensor;
-    };
-
 public:
    static result<std::unique_ptr<runtime_module>> create(const module_type_t &type);

    runtime_module() = default;
-    runtime_module(runtime_module &) = delete;
+    runtime_module(const runtime_module &) = delete;
    virtual ~runtime_module() = default;
+    runtime_module &operator=(const runtime_module &) = delete;

-    result<void> initialize(const module_header &header, interpreter &interp) noexcept;
-    virtual result<void> initialize_inter_modules(interpreter &interp) noexcept;
+    result<void> initialize(gsl::span<const gsl::byte> payload, interpreter &interp) noexcept;
    const module_type_t &type() const noexcept;

    interpreter &interp() const noexcept { return *interp_; }
@ -59,35 +49,20 @@ public:
    const mempool_desc &mempool(size_t index) const noexcept;
    mempool_desc mempool(memory_location_t location) const noexcept;

-    uint32_t inputs_size() const noexcept;
-    const runtime_shape_t &input_shape(size_t index) const noexcept;
-    const memory_range &input_desc(size_t index) const noexcept;
-    result<runtime_tensor> input_tensor(size_t index) noexcept;
-    result<void> input_tensor(size_t index, runtime_tensor tensor) noexcept;
-
-    uint32_t outputs_size() const noexcept;
-    const runtime_shape_t &output_shape(size_t index) const noexcept;
-    const memory_range &output_desc(size_t index) const noexcept;
-    result<runtime_tensor> output_tensor(size_t index) noexcept;
-    result<void> output_tensor(size_t index, runtime_tensor tensor) noexcept;
-
-    result<void> run() noexcept;
+    result<runtime_function *> find_function_by_id(size_t index) noexcept;

 protected:
-    virtual result<void> initialize_core(runtime_module_init_context &context) noexcept = 0;
-    virtual result<runtime_tensor> allocate_input_tensor(size_t index) noexcept = 0;
-    virtual result<runtime_tensor> allocate_output_tensor(size_t index) noexcept = 0;
-    virtual result<void> validate_input_tensor(size_t index, runtime_tensor tensor) noexcept = 0;
-    virtual result<void> validate_output_tensor(size_t index, runtime_tensor tensor) noexcept = 0;
-    result<runtime_tensor> device_input_tensor(size_t index) noexcept;
-    result<runtime_tensor> device_output_tensor(size_t index) noexcept;
-    virtual result<void> run_core() noexcept = 0;
+    virtual result<void> initialize_before_functions(runtime_module_init_context &context) noexcept;
+    virtual result<void> initialize_after_functions(runtime_module_init_context &context) noexcept;
+    virtual result<std::unique_ptr<runtime_function>> create_function() noexcept = 0;
+
+    gsl::span<std::unique_ptr<runtime_function>> functions() noexcept { return functions_; }

 private:
    module_header header_;
    std::vector<mempool_desc> mempools_;
-    std::vector<inout_tensor_info> input_tensors_;
-    std::vector<inout_tensor_info> output_tensors_;
+    std::vector<mempool_desc> shared_mempools_;
+    std::vector<std::unique_ptr<runtime_function>> functions_;
    interpreter *interp_ = nullptr;
 };

--- a/lib/nncase/v1/include/nncase/runtime/runtime_op_utility.h
+++ b/lib/nncase/v1/include/nncase/runtime/runtime_op_utility.h
@ -23,17 +23,20 @@ inline constexpr size_t get_bytes(datatype_t type)
    return nncase::detail::datatype_bytes(type);
 }

-inline size_t compute_size(const runtime_shape_t &shape)
+template <class TShape>
+inline size_t compute_size(const TShape &shape) // product of all elements of shape; NOTE(review): the accumulate below is seeded with the int literal 1, so the running product has type int rather than size_t - confirm no shape product can exceed INT_MAX
 {
    return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
 }

-inline size_t get_bytes(datatype_t type, const runtime_shape_t &shape)
+template <class TShape>
+inline size_t get_bytes(datatype_t type, const TShape &shape)
 {
    return compute_size(shape) * get_bytes(type);
 }

-inline size_t compute_size(const runtime_shape_t &shape, const runtime_shape_t &strides)
+template <class TShape>
+inline size_t compute_size(const TShape &shape, const TShape &strides)
 {
    size_t max_stride = 0, max_shape = 0;
    for (size_t i = 0; i < shape.size(); i++)
@ -48,7 +51,8 @@ inline size_t compute_size(const runtime_shape_t &shape, const runtime_shape_t &
    return size ? size : 1;
 }

-inline size_t get_bytes(datatype_t type, const runtime_shape_t &shape, const runtime_shape_t &strides)
+template <class TShape>
+inline size_t get_bytes(datatype_t type, const TShape &shape, const TShape &strides)
 {
    return compute_size(shape, strides) * get_bytes(type);
 }
--- a/lib/nncase/v1/include/nncase/runtime/small_vector.hpp
+++ b/lib/nncase/v1/include/nncase/runtime/small_vector.hpp
@ -690,7 +690,7 @@ public:
    }

    template <typename... Args>
-    iterator emplace(const_iterator position, Args &&... args)
+    iterator emplace(const_iterator position, Args &&...args)
    {
        auto pos = grow_at(position, 1);
        atraits::construct(get_alloc(), pos, std::forward<Args>(args)...);
@ -721,7 +721,7 @@ public:
    }

    template <typename... Args>
-    reference emplace_back(Args &&... args)
+    reference emplace_back(Args &&...args)
    {
        auto pos = grow_at(m_end, 1);
        atraits::construct(get_alloc(), pos, std::forward<Args>(args)...);
--- a/lib/nncase/v1/include/nncase/runtime/span_reader.h
+++ b/lib/nncase/v1/include/nncase/runtime/span_reader.h
@ -88,18 +88,16 @@ public:
    }

    template <class T>
-    T peek()
+    T peek_with_offset(size_t offset) // returns a copy of the T found offset elements past span_.data() without modifying span_; no bounds check is performed in this function
    {
-        auto value = *reinterpret_cast<const T *>(span_.data());
+        auto value = *reinterpret_cast<const T *>(span_.data() + offset); // direct typed load: the address must be suitably aligned for T (see peek_unaligned_with_offset for the unaligned counterpart)
        return value;
    }

    template <class T>
-    T peek_unaligned()
+    T peek()
    {
-        T value;
-        std::memcpy(&value, span_.data(), sizeof(T));
-        return value;
+        return peek_with_offset<T>(0);
    }

    template <class T>
@ -110,6 +108,12 @@ public:
        return value;
    }

+    template <class T>
+    T peek_unaligned()
+    {
+        return peek_unaligned_with_offset<T>(0);
+    }
+
    template <class T>
    const T *get_ref()
    {
--- a/lib/nncase/v1/include/nncase/runtime/stackvm/op_reader.h
+++ b/lib/nncase/v1/include/nncase/runtime/stackvm/op_reader.h
@ -1,4 +1,4 @@
-/* This file is generated by tools/stackvm_gen/IsaGen at 2021/7/14 19:17:48 +08:00.
+/* This file is generated by tools/stackvm_gen/IsaGen at 2021/8/11 17:40:11 +08:00.
 *
 * Copyright 2019-2021 Canaan Inc.
 *
@ -20,7 +20,7 @@
 #include "../span_reader.h"
 #include "opcode.h"

-BEGIN_NS_NNCASE_RT_STACKVM
+BEGIN_NS_NNCASE_RT_MODULE(stackvm)

 template <class TOp>
 struct op_reader;
@ -1141,7 +1141,8 @@ struct op_reader<tensor_call_op_t>
        tensor_call_op_t op(default_init);
        op.opcode = static_cast<opcode_t>(reader.read_unaligned<uint8_t>());
        op.funct = static_cast<tensor_function_t>(reader.read_unaligned<uint16_t>());
-        op.module_id = reader.read_unaligned<uint32_t>();
+        op.function_id = reader.read_unaligned<uint32_t>();
+        op.module_id = reader.read_unaligned<uint16_t>();
        op.num_src = reader.read_unaligned<uint8_t>();
        op.num_dst = reader.read_unaligned<uint8_t>();
        return op;
@ -1583,4 +1584,4 @@ private:
    result<void> next() noexcept;
 };

-END_NS_NNCASE_RT_STACKVM
+END_NS_NNCASE_RT_MODULE
--- a/lib/nncase/v1/include/nncase/runtime/stackvm/opcode.h
+++ b/lib/nncase/v1/include/nncase/runtime/stackvm/opcode.h
@ -1,4 +1,4 @@
-/* This file is generated by tools/stackvm_gen/IsaGen at 2021/7/14 19:17:48 +08:00.
+/* This file is generated by tools/stackvm_gen/IsaGen at 2021/8/11 17:40:11 +08:00.
 *
 * Copyright 2019-2021 Canaan Inc.
 *
@ -17,7 +17,7 @@
 #pragma once
 #include "../datatypes.h"

-BEGIN_NS_NNCASE_RT_STACKVM
+BEGIN_NS_NNCASE_RT_MODULE(stackvm)

 // Enums

@ -1265,13 +1265,14 @@ struct tensor_call_op_t
 {
    opcode_t opcode;
    tensor_function_t funct;
-    uint32_t module_id;
+    uint32_t function_id;
+    uint16_t module_id;
    uint8_t num_src;
    uint8_t num_dst;

    tensor_call_op_t(default_init_t) noexcept { }
-    explicit tensor_call_op_t(uint32_t module_id, uint8_t num_src, uint8_t num_dst) noexcept
-        : opcode(opcode_t::TENSOR), funct(tensor_function_t::CALL), module_id(module_id), num_src(num_src), num_dst(num_dst)
+    explicit tensor_call_op_t(uint32_t function_id, uint16_t module_id, uint8_t num_src, uint8_t num_dst) noexcept
+        : opcode(opcode_t::TENSOR), funct(tensor_function_t::CALL), function_id(function_id), module_id(module_id), num_src(num_src), num_dst(num_dst)
    {
    }
 };
@ -1576,4 +1577,4 @@ struct tensor_transpose_op_t
    }
 };

-END_NS_NNCASE_RT_STACKVM
+END_NS_NNCASE_RT_MODULE
--- a/lib/nncase/v1/include/nncase/runtime/stackvm/runtime_module.h
+++ b/lib/nncase/v1/include/nncase/runtime/stackvm/runtime_module.h
@ -15,10 +15,11 @@
 #pragma once
 #include "../runtime_module.h"

-BEGIN_NS_NNCASE_RT_STACKVM
+BEGIN_NS_NNCASE_RT_MODULE(stackvm)

 NNCASE_INLINE_VAR constexpr module_type_t stackvm_module_type = to_module_type("stackvm");
+NNCASE_INLINE_VAR constexpr uint32_t stackvm_module_version = 1;

 NNCASE_API result<std::unique_ptr<runtime_module>> create_stackvm_runtime_module();

-END_NS_NNCASE_RT_STACKVM
+END_NS_NNCASE_RT_MODULE
--- a/lib/nncase/v1/include/nncase/version.h
+++ b/lib/nncase/v1/include/nncase/version.h
@ -14,4 +14,4 @@
 */
 #pragma once
 #define NNCASE_VERSION "1.0.0"
-#define NNCASE_VERSION_SUFFIX "-8c384a4"
+#define NNCASE_VERSION_SUFFIX "-9fd39f9"
--- a/lib/nncase/v1/lib/cmake/nncaseruntime/nncaseruntimeConfig.cmake
+++ b/lib/nncase/v1/lib/cmake/nncaseruntime/nncaseruntimeConfig.cmake
@ -6,5 +6,3 @@ endif()
 if(NOT TARGET gsl-lite)
    find_package(gsl-lite REQUIRED)
 endif()
-
-
--- a/lib/nncase/v1/lib/cmake/nncaseruntime/nncaseruntimeTargets-release.cmake
+++ b/lib/nncase/v1/lib/cmake/nncaseruntime/nncaseruntimeTargets-release.cmake
@ -15,6 +15,16 @@ set_target_properties(nncaseruntime PROPERTIES
 list(APPEND _IMPORT_CHECK_TARGETS nncaseruntime )
 list(APPEND _IMPORT_CHECK_FILES_FOR_nncaseruntime "${_IMPORT_PREFIX}/lib/libnncase.runtime.a" )

+# Import target "kendryte" for configuration "Release"
+set_property(TARGET kendryte APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(kendryte PROPERTIES
+  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "C"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libkendryte.a"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS kendryte )
+list(APPEND _IMPORT_CHECK_FILES_FOR_kendryte "${_IMPORT_PREFIX}/lib/libkendryte.a" )
+
 # Import target "nncase_rt_modules_k210" for configuration "Release"
 set_property(TARGET nncase_rt_modules_k210 APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
 set_target_properties(nncase_rt_modules_k210 PROPERTIES
--- a/lib/nncase/v1/lib/cmake/nncaseruntime/nncaseruntimeTargets.cmake
+++ b/lib/nncase/v1/lib/cmake/nncaseruntime/nncaseruntimeTargets.cmake
@ -4,7 +4,7 @@ if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.5)
   message(FATAL_ERROR "CMake >= 2.6.0 required")
 endif()
 cmake_policy(PUSH)
-cmake_policy(VERSION 2.6...3.18)
+cmake_policy(VERSION 2.6...3.19)
 #----------------------------------------------------------------
 # Generated CMake target import file.
 #----------------------------------------------------------------
@ -16,7 +16,7 @@ set(CMAKE_IMPORT_FILE_VERSION 1)
 set(_targetsDefined)
 set(_targetsNotDefined)
 set(_expectedTargets)
-foreach(_expectedTarget kernels runtime nncaseruntime runtime_stackvm kernels_k210 runtime_k210 nncase_rt_modules_k210)
+foreach(_expectedTarget kernels runtime nncaseruntime runtime_stackvm kernels_k210 kendryte runtime_k210 nncase_rt_modules_k210)
  list(APPEND _expectedTargets ${_expectedTarget})
  if(NOT TARGET ${_expectedTarget})
    list(APPEND _targetsNotDefined ${_expectedTarget})
@ -83,14 +83,17 @@ set_target_properties(runtime_stackvm PROPERTIES
 add_library(kernels_k210 INTERFACE IMPORTED)

 set_target_properties(kernels_k210 PROPERTIES
-  INTERFACE_LINK_LIBRARIES "nncaseruntime"
+  INTERFACE_LINK_LIBRARIES "nncaseruntime;\$<LINK_ONLY:kendryte>"
 )

+# Create imported target kendryte
+add_library(kendryte STATIC IMPORTED)
+
 # Create imported target runtime_k210
 add_library(runtime_k210 INTERFACE IMPORTED)

 set_target_properties(runtime_k210 PROPERTIES
-  INTERFACE_LINK_LIBRARIES "nncaseruntime"
+  INTERFACE_LINK_LIBRARIES "nncaseruntime;\$<LINK_ONLY:kernels_k210>;\$<LINK_ONLY:kendryte>"
 )

 # Create imported target nncase_rt_modules_k210
--- a/lib/nncase/v1/lib/libnncase.rt_modules.k210.a
+++ b/lib/nncase/v1/lib/libnncase.rt_modules.k210.a
--- a/lib/nncase/v1/lib/libnncase.runtime.a
+++ b/lib/nncase/v1/lib/libnncase.runtime.a