[nncase] Upload runtime

2019-11-21 12:42:05 +08:00 · 2019-11-21 12:42:05 +08:00 · a3ac928968
parent d60910fc00
commit a3ac928968
7 changed files with 379 additions and 22 deletions
--- a/lib/nncase/include/io_utils.h
+++ b/lib/nncase/include/io_utils.h
@ -27,6 +27,8 @@ inline std::vector<uint8_t> read_file(const std::filesystem::path &filename)

    infile.seekg(0, std::ios::end);
    size_t length = infile.tellg();
+    if (!length)
+        throw std::runtime_error("Invalid file: " + filename.string());
    infile.seekg(0, std::ios::beg);
    std::vector<uint8_t> data(length);
    infile.read(reinterpret_cast<char *>(data.data()), length);
--- a/lib/nncase/include/kernels/neutral/neutral_kernels.h
+++ b/lib/nncase/include/kernels/neutral/neutral_kernels.h
@ -48,6 +48,35 @@ namespace kernels
            }
        }

+        /// Element-wise binary operation on two quantized (uint8) tensors.
+        ///
+        /// Walks every 4-D index of out_shape. For each index the element of each input is
+        /// located through kernels::details::get_reduced_offset (defined elsewhere; presumably
+        /// it maps the output index onto a broadcast input shape - confirm), the input's
+        /// offset is added and the sum is rescaled with runtime::mul_and_carry_shift. The two
+        /// rescaled operands are combined by op, the result is rescaled with
+        /// output_mul / output_shift, output_offset is added, and the value is clamped to
+        /// [0, 255] before it is stored.
+        ///
+        /// @param op callable invoked as op(int32_t, int32_t) on the rescaled operands.
+        template <class TOp>
+        void quantized_binary(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const runtime_shape_t &in_a_shape,
+            const runtime_shape_t &in_b_shape, const runtime_shape_t &out_shape, int32_t input_a_offset, int32_t input_a_mul, int32_t input_a_shift,
+            int32_t input_b_offset, int32_t input_b_mul, int32_t input_b_shift, int32_t output_mul, int32_t output_shift, int32_t output_offset, TOp &&op)
+        {
+            for (int32_t i0 = 0; i0 < out_shape[0]; i0++)
+            {
+                for (int32_t i1 = 0; i1 < out_shape[1]; i1++)
+                {
+                    for (int32_t i2 = 0; i2 < out_shape[2]; i2++)
+                    {
+                        for (int32_t i3 = 0; i3 < out_shape[3]; i3++)
+                        {
+                            runtime_shape_t out_index = { i0, i1, i2, i3 };
+                            const auto a_index = kernels::details::get_reduced_offset(out_index, in_a_shape);
+                            const auto b_index = kernels::details::get_reduced_offset(out_index, in_b_shape);
+
+                            // Load the raw operands, then apply each input's offset and multiplier/shift.
+                            int32_t lhs = input_a[offset(in_a_shape, a_index)];
+                            int32_t rhs = input_b[offset(in_b_shape, b_index)];
+                            lhs = runtime::mul_and_carry_shift(lhs + input_a_offset, input_a_mul, input_a_shift);
+                            rhs = runtime::mul_and_carry_shift(rhs + input_b_offset, input_b_mul, input_b_shift);
+
+                            // Combine, rescale to the output, and saturate to the uint8 range.
+                            auto rescaled = runtime::mul_and_carry_shift(op(lhs, rhs), output_mul, output_shift);
+                            output[offset(out_shape, out_index)] = (uint8_t)std::clamp(rescaled + output_offset, 0, 255);
+                        }
+                    }
+                }
+            }
+        }
+
        template <class TRange, class TPtrGetter = details::default_ptr_getter<uint8_t, TRange>>
        inline void concat(xtl::span<TRange> inputs, uint8_t *output, xtl::span<const int32_t> concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {})
        {
@ -125,6 +154,71 @@ namespace kernels
            }
        }

+        /// Quantized (uint8) 2-D convolution with grouping, stride, dilation and padding.
+        ///
+        /// The input is addressed as a dense [in_shape[0], in_shape[1], in_shape[2], in_shape[3]]
+        /// array (batch, channel, row, column). Weights are addressed as a dense
+        /// [out_channels, in_shape[1] / groups, filter_h, filter_w] array and bias holds one
+        /// int32 per output channel. Results are written to output sequentially in the order
+        /// batch, group, output channel within group, output row, output column.
+        ///
+        /// For every output element the accumulator starts at the channel's bias and sums
+        /// (input + input_offset) * (weight + filter_offset) over the filter window. The sum
+        /// is rescaled with runtime::mul_and_carry_shift(value, output_mul, output_shift),
+        /// output_offset is added, and the result is clamped to [0, 255].
+        inline void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, int32_t input_offset, int32_t filter_offset,
+            int32_t output_mul, int32_t output_shift, int32_t output_offset, const runtime_shape_t &in_shape, int32_t groups, int32_t out_channels,
+            int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
+            const padding &padding_h, const padding &padding_w)
+        {
+            const auto out_h = details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
+            const auto out_w = details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
+            // Channels per group on the input and output side.
+            // NOTE(review): groups == 0 would divide by zero here - callers must guarantee groups >= 1.
+            const auto g_ic = in_shape[1] / groups;
+            const auto g_oc = out_channels / groups;
+
+            for (int32_t batch = 0; batch < in_shape[0]; batch++)
+            {
+                const uint8_t *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
+
+                for (int32_t og = 0; og < groups; og++)
+                {
+                    const uint8_t *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
+                    const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
+
+                    for (int32_t oc = 0; oc < g_oc; oc++)
+                    {
+                        const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
+
+                        for (int32_t oy = 0; oy < out_h; oy++)
+                        {
+                            for (int32_t ox = 0; ox < out_w; ox++)
+                            {
+                                // Top-left input coordinate of the window; negative inside the leading padding.
+                                const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
+                                const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
+                                // Restrict the filter taps to those that land inside the input, so
+                                // taps over the padded border are skipped and add nothing to the sum.
+                                const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
+                                const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
+                                const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
+                                const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
+                                // Accumulator starts from the bias of the absolute output channel.
+                                int32_t value = bias[og * g_oc + oc];
+
+                                for (int32_t ic = 0; ic < g_ic; ic++)
+                                {
+                                    const uint8_t *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
+                                    const uint8_t *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
+
+                                    for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
+                                    {
+                                        for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
+                                        {
+                                            const int32_t in_y = in_y_origin + dilation_h * ky;
+                                            const int32_t in_x = in_x_origin + dilation_w * kx;
+
+                                            // Offsets are added to the raw uint8 values before multiplying.
+                                            const int32_t in_v = (int32_t)in_c_p[in_y * in_shape[3] + in_x] + input_offset;
+                                            const int32_t w = (int32_t)w_ic_p[ky * filter_w + kx] + filter_offset;
+
+                                            value += in_v * w;
+                                        }
+                                    }
+                                }
+
+                                // Rescale the int32 accumulator, add the output offset and saturate to uint8.
+                                auto output_val = static_cast<int32_t>(runtime::mul_and_carry_shift(value, output_mul, output_shift));
+                                output_val += output_offset;
+                                *output++ = (uint8_t)std::clamp(output_val, 0, 255);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
        template <class TQ>
        void dequantize(const TQ *input, float *output, size_t count, const quant_param_t &param)
        {
@ -156,6 +250,28 @@ namespace kernels
            }
        }

+        /// Quantized (uint8) matrix multiplication with per-column bias.
+        ///
+        /// input_a is addressed as a dense row-major [a_rows, a_cols] matrix, input_b as
+        /// [a_cols, b_cols] and output as [a_rows, b_cols]. Each output element starts from
+        /// bias[column], accumulates (a + input_a_offset) * (b + input_b_offset) in int32, is
+        /// rescaled with runtime::mul_and_carry_shift(value, output_mul, output_shift), has
+        /// output_offset added and is clamped to [0, 255].
+        ///
+        /// A non-positive a_rows, a_cols or b_cols simply yields no iterations of the
+        /// corresponding loop.
+        inline void quantized_matmul(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const int32_t *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, int32_t input_a_offset, int32_t input_b_offset,
+            int32_t output_mul, int32_t output_shift, int32_t output_offset)
+        {
+            // Loop counters are signed like the dimensions: comparing a size_t counter with an
+            // int32_t bound would convert a negative bound into a huge unsigned value and run
+            // far out of bounds. Indices are widened to size_t only for the address arithmetic.
+            for (int32_t oy = 0; oy < a_rows; oy++)
+            {
+                for (int32_t ox = 0; ox < b_cols; ox++)
+                {
+                    int32_t value = bias[ox];
+                    for (int32_t i = 0; i < a_cols; i++)
+                    {
+                        const auto a = (int32_t)input_a[static_cast<size_t>(oy) * a_cols + i] + input_a_offset;
+                        const auto b = (int32_t)input_b[static_cast<size_t>(i) * b_cols + ox] + input_b_offset;
+                        value += a * b;
+                    }
+
+                    // Rescale the accumulator, add the output offset and saturate to uint8.
+                    auto output_val = static_cast<int32_t>(runtime::mul_and_carry_shift(value, output_mul, output_shift));
+                    output_val += output_offset;
+                    output[static_cast<size_t>(oy) * b_cols + ox] = (uint8_t)std::clamp(output_val, 0, 255);
+                }
+            }
+        }
+
        template <class T>
        void pad(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_paddings_t &paddings, T pad_value)
        {
@ -313,7 +429,8 @@ namespace kernels
            }
        }

-        inline void resize_bilinear(const float *input, float *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
+        template <class T>
+        inline void resize_bilinear(const T *input, T *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
        {
            auto height_scale = (float)in_shape[2] / out_h;
            auto width_scale = (float)in_shape[3] / out_w;
@ -353,7 +470,7 @@ namespace kernels
                            auto a2 = (1 - (in_y - in_y0)) * (in_x - in_x0);
                            auto a3 = (in_y - in_y0) * (in_x - in_x0);

-                            output[destIdx++] = v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3;
+                            output[destIdx++] = T(v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3);
                        }
                    }
                }
--- a/lib/nncase/include/runtime/neutral/neutral_ops_body.h
+++ b/lib/nncase/include/runtime/neutral/neutral_ops_body.h
@ -33,6 +33,26 @@ namespace runtime
            value_range<float> fused_activation;
        };

+        /// Node body for the quantized binary op; the runtime handler forwards every field
+        /// to kernels::neutral::quantized_binary.
+        ///
+        /// NOTE(review): simple_node_body is defined elsewhere; presumably it serializes the
+        /// struct as-is, which would make member order and types part of the model format -
+        /// confirm before reordering or resizing members.
+        struct quantized_binary_options : public simple_node_body<quantized_binary_options>
+        {
+            // Memory ranges of the two uint8 operands and of the uint8 result.
+            memory_range input_a;
+            memory_range input_b;
+            memory_range output;
+            // Operation selector (binary_add, binary_sub, binary_mul, binary_div, binary_min, binary_max).
+            binary_op_t binary_op;
+            // 4-D shapes of the operands and of the result.
+            runtime_shape_t in_a_shape;
+            runtime_shape_t in_b_shape;
+            runtime_shape_t out_shape;
+            // Operand A: offset added to the raw value, then multiplier/shift applied to the sum.
+            int32_t input_a_offset;
+            int32_t input_a_mul;
+            int32_t input_a_shift;
+            // Operand B: same treatment as operand A.
+            int32_t input_b_offset;
+            int32_t input_b_mul;
+            int32_t input_b_shift;
+            // Result: multiplier/shift applied to the op result, then offset added before clamping to [0, 255].
+            int32_t output_offset;
+            int32_t output_mul;
+            int32_t output_shift;
+        };
+
        struct concat_options
        {
            memory_range output;
@ -123,6 +143,78 @@ namespace runtime
            }
        };

+        /// Node body for the quantized conv2d op.
+        ///
+        /// deserialize() and serialize() visit the members in the same order; that order is
+        /// the on-disk layout, so the two functions must be kept in sync.
+        struct quantized_conv2d_options
+        {
+            // Memory ranges of the uint8 input and output tensors.
+            memory_range input;
+            memory_range output;
+            // 4-D input shape; index 1 is the input channel count used to size the weights.
+            runtime_shape_t in_shape;
+            int32_t groups;
+            int32_t out_channels;
+            padding padding_h;
+            padding padding_w;
+            int32_t filter_h;
+            int32_t filter_w;
+            int32_t stride_h;
+            int32_t stride_w;
+            int32_t dilation_h;
+            int32_t dilation_w;
+            // Offsets added to the raw input / weight values before multiplication.
+            int32_t input_offset;
+            int32_t filter_offset;
+            // Rescaling of the int32 accumulator: multiplier/shift, then offset.
+            int32_t output_mul;
+            int32_t output_shift;
+            int32_t output_offset;
+            // Views over data read by span_reader::read_span; presumably they alias the
+            // reader's buffer rather than own the data - TODO confirm lifetime.
+            xtl::span<const int32_t> bias;
+            xtl::span<const uint8_t> weights;
+
+            /// Reads all members from reader in declaration order, then the bias and weights spans.
+            void deserialize(span_reader &reader)
+            {
+                reader.read(input);
+                reader.read(output);
+                reader.read(in_shape);
+                reader.read(groups);
+                reader.read(out_channels);
+                reader.read(padding_h);
+                reader.read(padding_w);
+                reader.read(filter_h);
+                reader.read(filter_w);
+                reader.read(stride_h);
+                reader.read(stride_w);
+                reader.read(dilation_h);
+                reader.read(dilation_w);
+                reader.read(input_offset);
+                reader.read(filter_offset);
+                reader.read(output_mul);
+                reader.read(output_shift);
+                reader.read(output_offset);
+                // One bias value per output channel.
+                reader.read_span(bias, out_channels);
+                // out_channels * (input channels per group) * filter_h * filter_w weight values.
+                // NOTE(review): groups == 0 in a malformed model divides by zero here - confirm it is validated upstream.
+                reader.read_span(weights, (size_t)out_channels * in_shape[1] / groups * filter_h * filter_w);
+            }
+
+            /// Writes all members to writer in the order deserialize() reads them.
+            void serialize(binary_writer &writer) const
+            {
+                writer.write(input);
+                writer.write(output);
+                writer.write(in_shape);
+                writer.write(groups);
+                writer.write(out_channels);
+                writer.write(padding_h);
+                writer.write(padding_w);
+                writer.write(filter_h);
+                writer.write(filter_w);
+                writer.write(stride_h);
+                writer.write(stride_w);
+                writer.write(dilation_h);
+                writer.write(dilation_w);
+                writer.write(input_offset);
+                writer.write(filter_offset);
+                writer.write(output_mul);
+                writer.write(output_shift);
+                writer.write(output_offset);
+                writer.write_array(bias);
+                writer.write_array(weights);
+            }
+        };
+
        struct dequantize_options : public simple_node_body<dequantize_options>
        {
            memory_range input;
@ -166,6 +258,54 @@ namespace runtime
            }
        };

+        /// Node body for the quantized matmul op.
+        ///
+        /// deserialize() and serialize() visit the members in the same order; that order is
+        /// the on-disk layout, so the two functions must be kept in sync.
+        struct quantized_matmul_options
+        {
+            // Memory ranges of the two uint8 operands and of the uint8 result.
+            memory_range input_a;
+            memory_range input_b;
+            memory_range output;
+            // Matrix dimensions: A is a_rows x a_cols, B is a_cols x b_cols.
+            int32_t a_rows;
+            int32_t a_cols;
+            int32_t b_cols;
+            // Offsets added to the raw operand values before multiplication.
+            int32_t input_a_offset;
+            int32_t input_b_offset;
+            // Rescaling of the int32 accumulator: multiplier/shift, then offset.
+            int32_t output_mul;
+            int32_t output_shift;
+            int32_t output_offset;
+            // View over data read by span_reader::read_span; presumably it aliases the
+            // reader's buffer rather than owns the data - TODO confirm lifetime.
+            xtl::span<const int32_t> bias;
+
+            /// Reads all members from reader in declaration order, then b_cols bias values.
+            void deserialize(span_reader &reader)
+            {
+                reader.read(input_a);
+                reader.read(input_b);
+                reader.read(output);
+                reader.read(a_rows);
+                reader.read(a_cols);
+                reader.read(b_cols);
+                reader.read(input_a_offset);
+                reader.read(input_b_offset);
+                reader.read(output_mul);
+                reader.read(output_shift);
+                reader.read(output_offset);
+                // One bias value per output column.
+                reader.read_span(bias, b_cols);
+            }
+
+            /// Writes all members to writer in the order deserialize() reads them.
+            void serialize(binary_writer &writer) const
+            {
+                writer.write(input_a);
+                writer.write(input_b);
+                writer.write(output);
+                writer.write(a_rows);
+                writer.write(a_cols);
+                writer.write(b_cols);
+                writer.write(input_a_offset);
+                writer.write(input_b_offset);
+                writer.write(output_mul);
+                writer.write(output_shift);
+                writer.write(output_offset);
+                writer.write_array(bias);
+            }
+        };
+
        struct memory_copy_options : public simple_node_body<memory_copy_options>
        {
            memory_range input;
--- a/lib/nncase/include/runtime/runtime_op.def
+++ b/lib/nncase/include/runtime/runtime_op.def
@ -1,19 +1,22 @@
 BEGINE_DEFINE_TARGET(neutral)
-    DEFINE_NEUTRAL_RUNTIME_OP(binary,           Binary,         0x0)
-    DEFINE_NEUTRAL_RUNTIME_OP(concat,           Concat,         0x1)
-    DEFINE_NEUTRAL_RUNTIME_OP(conv2d,           Conv2D,         0x2)
-    DEFINE_NEUTRAL_RUNTIME_OP(dequantize,       Dequantize,     0x3)
-    DEFINE_NEUTRAL_RUNTIME_OP(matmul,           MatMul,         0x4)
-    DEFINE_NEUTRAL_RUNTIME_OP(pad,              Pad,            0x5)
-    DEFINE_NEUTRAL_RUNTIME_OP(quantize,         Quantize,       0x6)
-    DEFINE_NEUTRAL_RUNTIME_OP(reduce,           Reduce,         0x7)
-    DEFINE_NEUTRAL_RUNTIME_OP(reduce_window2d,  ReduceWindow2D, 0x8)
-    DEFINE_NEUTRAL_RUNTIME_OP(memory_copy,      MemoryCopy,     0x9)
-    DEFINE_NEUTRAL_RUNTIME_OP(resize_image,     ResizeImage,    0x0A)
-    DEFINE_NEUTRAL_RUNTIME_OP(softmax,          Softmax,        0x0B)
-    DEFINE_NEUTRAL_RUNTIME_OP(transpose,        Transpose,      0x0C)
-    DEFINE_NEUTRAL_RUNTIME_OP(strided_slice,    StridedSlice,   0x0D)
-    DEFINE_NEUTRAL_RUNTIME_OP(unary,            Unary,          0x0E)
+    DEFINE_NEUTRAL_RUNTIME_OP(binary,           Binary,				0x0)
+    DEFINE_NEUTRAL_RUNTIME_OP(concat,           Concat,				0x1)
+    DEFINE_NEUTRAL_RUNTIME_OP(conv2d,           Conv2D,				0x2)
+    DEFINE_NEUTRAL_RUNTIME_OP(dequantize,       Dequantize,			0x3)
+    DEFINE_NEUTRAL_RUNTIME_OP(matmul,           MatMul,				0x4)
+    DEFINE_NEUTRAL_RUNTIME_OP(pad,              Pad,				0x5)
+    DEFINE_NEUTRAL_RUNTIME_OP(quantize,         Quantize,			0x6)
+    DEFINE_NEUTRAL_RUNTIME_OP(reduce,           Reduce,				0x7)
+    DEFINE_NEUTRAL_RUNTIME_OP(reduce_window2d,  ReduceWindow2D,		0x8)
+    DEFINE_NEUTRAL_RUNTIME_OP(memory_copy,      MemoryCopy,			0x9)
+    DEFINE_NEUTRAL_RUNTIME_OP(resize_image,     ResizeImage,		0x0A)
+    DEFINE_NEUTRAL_RUNTIME_OP(softmax,          Softmax,			0x0B)
+    DEFINE_NEUTRAL_RUNTIME_OP(transpose,        Transpose,			0x0C)
+    DEFINE_NEUTRAL_RUNTIME_OP(strided_slice,    StridedSlice,		0x0D)
+    DEFINE_NEUTRAL_RUNTIME_OP(unary,            Unary,				0x0E)
+    DEFINE_NEUTRAL_RUNTIME_OP(quantized_conv2d, QuantizedConv2D,	0x0F)
+    DEFINE_NEUTRAL_RUNTIME_OP(quantized_matmul, QuantizedMatMul,	0x10)
+    DEFINE_NEUTRAL_RUNTIME_OP(quantized_binary, QuantizedBinary,	0x11)
 END_DEFINE_TARGET()

 // CPU
--- a/lib/nncase/include/runtime/runtime_op_utility.h
+++ b/lib/nncase/include/runtime/runtime_op_utility.h
@ -123,6 +123,14 @@ namespace runtime
        return (int32_t)carry_shift<int64_t, Banker>((int64_t)value * mul, shift);
    }

+    /// Clamps value to the range of a Bits-wide two's-complement signed integer,
+    /// i.e. [-2^(Bits-1), 2^(Bits-1) - 1].
+    ///
+    /// @tparam Bits width of the target integer; must be in [1, 32].
+    /// @param value value to clamp.
+    /// @return value limited to the representable range of a Bits-bit signed integer.
+    template <uint8_t Bits>
+    inline int32_t clamp(int32_t value)
+    {
+        // Bits == 0 or Bits > 32 would need an out-of-range shift, which is undefined behaviour.
+        static_assert(Bits >= 1 && Bits <= 32, "Bits must be in the range [1, 32]");
+
+        // Build the bounds from a positive 64-bit power of two rather than right-shifting a
+        // negative int32_t, whose result is implementation-defined before C++20.
+        constexpr auto max = static_cast<int32_t>((int64_t(1) << (Bits - 1)) - 1);
+        constexpr auto min = static_cast<int32_t>(-(int64_t(1) << (Bits - 1)));
+        return std::clamp(value, min, max);
+    }
+
    template <class T>
    struct to_datatype
    {
--- a/lib/nncase/include/targets/target.h
+++ b/lib/nncase/include/targets/target.h
@ -18,13 +18,22 @@
 #include <scheduler/memory_allocator.h>
 #include <transforms/transform.h>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>

 namespace nncase
 {
+/// Options handed to a target at construction; the target constructor stores a copy
+/// of them in its protected options_ member.
+struct target_options
+{
+    // Input type selected for the target, as a string.
+    // NOTE(review): the accepted values are not visible here - confirm against the option parser.
+    std::string input_type;
+};
+
 class target
 {
 public:
+    target(const target_options &options)
+        : options_(options) {}
+
    virtual void fill_allocators(std::unordered_map<memory_type_t, scheduler::memory_allocator *> &allocators, std::vector<std::unique_ptr<scheduler::memory_allocator>> &allocator_holders) = 0;
    virtual void registry_codegen_ops() = 0;
    virtual void registry_evaluator_ops() = 0;
@ -32,6 +41,10 @@ public:
    virtual void add_optimize1_transforms(std::vector<std::unique_ptr<transforms::transform>> &transforms) = 0;
    virtual void add_optimize2_transforms(std::vector<std::unique_ptr<transforms::transform>> &transforms) = 0;
    virtual void add_quantization_checkpoint_transforms(std::vector<std::unique_ptr<transforms::transform>> &transforms) = 0;
-    virtual void add_quantization_transforms(ir::quantizer& quantizer, const quant_param_t& input_quant_param, std::vector<std::unique_ptr<transforms::transform>> &transforms) = 0;
+    virtual void add_quantization_transforms(ir::quantizer &quantizer, std::vector<std::unique_ptr<transforms::transform>> &transforms) = 0;
+    virtual void add_quantization_broadcast(std::unordered_set<ir::node_opcode> &opcodes) = 0;
+
+protected:
+    target_options options_;
 };
 }
--- a/lib/nncase/runtime/neutral/neutral_ops.cpp
+++ b/lib/nncase/runtime/neutral/neutral_ops.cpp
@ -35,6 +35,19 @@ using namespace nncase::runtime;
        return kcr_error;             \
    }

+// Dispatches KERNEL on an element datatype: expands to a switch on `type` that
+// invokes KERNEL(float) for dt_float32 and KERNEL(uint8_t) for dt_uint8. Any other
+// datatype makes the *enclosing function* return kcr_error, so the macro may only be
+// used inside functions returning kernel_call_result.
+#define FP_OR_Q_IMPL(type, KERNEL) \
+    switch (type)                  \
+    {                              \
+    case dt_float32:               \
+        KERNEL(float);             \
+        break;                     \
+    case dt_uint8:                 \
+        KERNEL(uint8_t);           \
+        break;                     \
+    default:                       \
+        return kcr_error;          \
+    }
+
 namespace nncase
 {
 namespace runtime
@ -76,6 +89,43 @@ namespace runtime
            }
        }

+        /// Runtime handler for the quantized binary op.
+        ///
+        /// Resolves the operand and result memory ranges to uint8 buffers and runs
+        /// kernels::neutral::quantized_binary with the functor matching options.binary_op.
+        ///
+        /// @return kcr_done on success, kcr_error when options.binary_op is not one of
+        ///         add, sub, mul, div, min or max.
+        kernel_call_result quantized_binary(quantized_binary_options &options, interpreter_t &interpreter, interpreter_step_t step)
+        {
+            auto lhs = interpreter.memory_at<uint8_t>(options.input_a);
+            auto rhs = interpreter.memory_at<uint8_t>(options.input_b);
+            auto dest = interpreter.memory_at<uint8_t>(options.output);
+
+            // Everything except the functor is identical for all operations.
+            auto run_kernel = [&](auto fn) {
+                kernels::neutral::quantized_binary(lhs.data(), rhs.data(), dest.data(),
+                    options.in_a_shape, options.in_b_shape, options.out_shape,
+                    options.input_a_offset, options.input_a_mul, options.input_a_shift,
+                    options.input_b_offset, options.input_b_mul, options.input_b_shift,
+                    options.output_mul, options.output_shift, options.output_offset, fn);
+            };
+
+            switch (options.binary_op)
+            {
+            case binary_add:
+                run_kernel([](auto x, auto y) { return x + y; });
+                break;
+            case binary_sub:
+                run_kernel([](auto x, auto y) { return x - y; });
+                break;
+            case binary_mul:
+                run_kernel([](auto x, auto y) { return x * y; });
+                break;
+            case binary_div:
+                // Adds half the divisor before dividing.
+                run_kernel([](auto x, auto y) { return (x + y / 2) / y; });
+                break;
+            case binary_min:
+                run_kernel([](auto x, auto y) { return std::min(x, y); });
+                break;
+            case binary_max:
+                run_kernel([](auto x, auto y) { return std::max(x, y); });
+                break;
+            default:
+                return kcr_error;
+            }
+
+            return kcr_done;
+        }
+
        kernel_call_result concat(concat_options &options, interpreter_t &interpreter, interpreter_step_t step)
        {
            auto output = interpreter.memory_at<uint8_t>(options.output);
@ -93,6 +143,16 @@ namespace runtime
            return kcr_done;
        }

+        /// Runtime handler for the quantized conv2d op: resolves the input and output memory
+        /// ranges to uint8 buffers and forwards them, together with the weights, bias and all
+        /// parameters stored in options, to kernels::neutral::quantized_conv2d.
+        ///
+        /// @return always kcr_done.
+        kernel_call_result quantized_conv2d(quantized_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
+        {
+            auto src = interpreter.memory_at<uint8_t>(options.input);
+            auto dest = interpreter.memory_at<uint8_t>(options.output);
+
+            kernels::neutral::quantized_conv2d(
+                src.data(), dest.data(), options.weights.data(), options.bias.data(),
+                options.input_offset, options.filter_offset,
+                options.output_mul, options.output_shift, options.output_offset,
+                options.in_shape, options.groups, options.out_channels,
+                options.filter_h, options.filter_w,
+                options.stride_h, options.stride_w,
+                options.dilation_h, options.dilation_w,
+                options.padding_h, options.padding_w);
+
+            return kcr_done;
+        }
+
        kernel_call_result dequantize(dequantize_options &options, interpreter_t &interpreter, interpreter_step_t step)
        {
            auto input = interpreter.memory_at<uint8_t>(options.input);
@ -111,6 +171,16 @@ namespace runtime
            return kcr_done;
        }

+        /// Runtime handler for the quantized matmul op: resolves both operand ranges and the
+        /// output range to uint8 buffers and forwards them, together with the bias and the
+        /// parameters stored in options, to kernels::neutral::quantized_matmul.
+        ///
+        /// @return always kcr_done.
+        kernel_call_result quantized_matmul(quantized_matmul_options &options, interpreter_t &interpreter, interpreter_step_t step)
+        {
+            auto lhs = interpreter.memory_at<uint8_t>(options.input_a);
+            auto rhs = interpreter.memory_at<uint8_t>(options.input_b);
+            auto dest = interpreter.memory_at<uint8_t>(options.output);
+
+            kernels::neutral::quantized_matmul(
+                lhs.data(), rhs.data(), dest.data(), options.bias.data(),
+                options.a_rows, options.a_cols, options.b_cols,
+                options.input_a_offset, options.input_b_offset,
+                options.output_mul, options.output_shift, options.output_offset);
+
+            return kcr_done;
+        }
+
        kernel_call_result memory_copy(memory_copy_options &options, interpreter_t &interpreter, interpreter_step_t step)
        {
            auto input = interpreter.memory_at<float>(options.input);
@ -205,20 +275,24 @@ namespace runtime

        kernel_call_result resize_image(resize_image_options &options, interpreter_t &interpreter, interpreter_step_t step)
        {
-            auto input = interpreter.memory_at<float>(options.input);
-            auto output = interpreter.memory_at<float>(options.output);
+            auto input = interpreter.memory_at<uint8_t>(options.input);
+            auto output = interpreter.memory_at<uint8_t>(options.output);

            if (options.mode == image_resize_bilinear)
            {
-                kernels::neutral::resize_bilinear(input.data(), output.data(), options.in_shape, options.out_h, options.out_w, options.align_corners);
+#define RESIZE_BL_KERNEL(T) \
+    kernels::neutral::resize_bilinear(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.out_h, options.out_w, options.align_corners);
+
+                FP_OR_Q_IMPL(options.input.datatype, RESIZE_BL_KERNEL);
                return kcr_done;
+#undef RESIZE_BL_KERNEL
            }
            else
            {
 #define RESIZE_NN_KERNEL(T) \
    kernels::neutral::resize_nearest_neighbor(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.out_h, options.out_w);

-                ELEM_SIZE_IMPL(options.input.datatype, RESIZE_NN_KERNEL);
+                FP_OR_Q_IMPL(options.input.datatype, RESIZE_NN_KERNEL);
                return kcr_done;
 #undef RESIZE_NN_KERNEL
            }