
803 lines
32 KiB

/* Copyright 2019-2021 Canaan Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once
#include "../kernel_utils.h"
#include <cmath>
#include <nncase/runtime/nnil.h>
#include <nncase/runtime/runtime_op_utility.h>
#include <xtl/xspan.hpp>
#ifdef __riscv
#include "../riscv/neutral_kernels.h"
namespace nncase::kernels::neutral
template <class TOp, class TShape>
void binary(const float *input_a, const float *input_b, float *output, const TShape &in_a_shape,
const TShape &in_b_shape, const TShape &out_shape, const value_range<float> &fused_activation, TOp &&op)
// opt. no broadcast
if (in_a_shape == in_b_shape)
auto size = kernels::detail::compute_size(in_a_shape);
for (size_t i = 0; i < size; i++)
const auto a = input_a[i];
const auto b = input_b[i];
output[i] = kernels::detail::apply_activation(op(a, b), fused_activation);
// fallback
for (size_t d0 = 0; d0 < out_shape[0]; d0++)
for (size_t d1 = 0; d1 < out_shape[1]; d1++)
for (size_t d2 = 0; d2 < out_shape[2]; d2++)
for (size_t d3 = 0; d3 < out_shape[3]; d3++)
TShape in_off = { d0, d1, d2, d3 };
const auto in_a_off = kernels::detail::get_reduced_offset(in_off, in_a_shape);
const auto in_b_off = kernels::detail::get_reduced_offset(in_off, in_b_shape);
const auto a = input_a[offset(in_a_shape, in_a_off)];
const auto b = input_b[offset(in_b_shape, in_b_off)];
output[offset(out_shape, in_off)] = kernels::detail::apply_activation(op(a, b), fused_activation);
// Element-wise binary operation on two uint8 quantized tensors.
//
// Each operand is offset, then rescaled with runtime::mul_and_carry_shift using
// its own multiplier/shift. `op` is applied to the two rescaled values, the
// result is rescaled with `output_mul`/`output_shift`, `output_offset` is added
// and the sum is clamped to [0, 255].
//
// Equal input shapes take a linear path; otherwise each 4-D output coordinate
// is mapped onto the inputs via kernels::detail::get_reduced_offset.
template <class TOp, class TShape>
void quantized_binary(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const TShape &in_a_shape,
    const TShape &in_b_shape, const TShape &out_shape, int32_t input_a_offset, int32_t input_a_mul, int32_t input_a_shift,
    int32_t input_b_offset, int32_t input_b_mul, int32_t input_b_shift, int32_t output_mul, int32_t output_shift, int32_t output_offset, TOp &&op)
{
    // Per-element pipeline shared by both paths.
    const auto compute = [&](uint8_t raw_a, uint8_t raw_b) -> uint8_t {
        auto lhs = (int32_t)raw_a;
        auto rhs = (int32_t)raw_b;
        lhs = runtime::mul_and_carry_shift(lhs + input_a_offset, input_a_mul, input_a_shift);
        rhs = runtime::mul_and_carry_shift(rhs + input_b_offset, input_b_mul, input_b_shift);
        auto scaled = runtime::mul_and_carry_shift(op(lhs, rhs), output_mul, output_shift);
        return (uint8_t)std::clamp(scaled + output_offset, 0, 255);
    };

    // Fast path: identical shapes, no broadcasting required.
    if (in_a_shape == in_b_shape)
    {
        const auto element_count = kernels::detail::compute_size(in_a_shape);
        for (size_t idx = 0; idx < element_count; idx++)
            output[idx] = compute(input_a[idx], input_b[idx]);
        return;
    }

    // General path: resolve the source element of each operand per output coordinate.
    for (int32_t n = 0; n < out_shape[0]; n++)
    {
        for (int32_t c = 0; c < out_shape[1]; c++)
        {
            for (int32_t y = 0; y < out_shape[2]; y++)
            {
                for (int32_t x = 0; x < out_shape[3]; x++)
                {
                    TShape out_index = { n, c, y, x };
                    const auto a_index = kernels::detail::get_reduced_offset(out_index, in_a_shape);
                    const auto b_index = kernels::detail::get_reduced_offset(out_index, in_b_shape);
                    output[offset(out_shape, out_index)] = compute(input_a[offset(in_a_shape, a_index)], input_b[offset(in_b_shape, b_index)]);
                }
            }
        }
    }
}
// Concatenates several byte buffers into `output`.
//
// For every outer index, one chunk of `inner_size * concat_dims[i]` bytes is
// copied from input `i` (its base pointer is obtained through `getter`) and
// appended to the output, inputs being visited in order.
template <class TRange, class TPtrGetter = detail::default_ptr_getter<uint8_t, TRange>>
inline void concat(xtl::span<TRange> inputs, uint8_t *output, xtl::span<const int32_t> concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {})
{
    for (size_t outer = 0; outer < outer_size; outer++)
    {
        for (size_t idx = 0; idx < inputs.size(); idx++)
        {
            const auto chunk = inner_size * concat_dims[idx];
            const auto chunk_begin = getter(inputs[idx]) + outer * chunk;
            // std::copy returns the position just past the last byte written.
            output = std::copy(chunk_begin, chunk_begin + chunk, output);
        }
    }
}
template <class TShape>
void conv2d(const float *input, float *output, const float *weights, const float *bias, const TShape &in_shape,
int32_t groups, int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
const auto out_h = detail::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = detail::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
const auto g_ic = in_shape[1] / groups;
const auto g_oc = (size_t)out_channels / groups;
for (size_t batch = 0; batch < in_shape[0]; batch++)
const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (size_t og = 0; og < (size_t)groups; og++)
const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
for (size_t oc = 0; oc < g_oc; oc++)
const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
for (size_t oy = 0; oy < out_h; oy++)
for (size_t ox = 0; ox < out_w; ox++)
const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
const size_t filter_y_start = (size_t)std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
const size_t filter_y_end = (size_t)std::min(filter_h, ((int32_t)in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
const size_t filter_x_start = (size_t)std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
const size_t filter_x_end = (size_t)std::min(filter_w, ((int32_t)in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
float value = bias[og * g_oc + oc];
for (size_t ic = 0; ic < g_ic; ic++)
const float *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
for (size_t ky = filter_y_start; ky < filter_y_end; ky++)
for (size_t kx = filter_x_start; kx < filter_x_end; kx++)
const size_t in_y = in_y_origin + dilation_h * ky;
const size_t in_x = in_x_origin + dilation_w * kx;
const float in_v = in_c_p[in_y * in_shape[3] + in_x];
const float w = w_ic_p[ky * filter_w + kx];
value += in_v * w;
*output++ = detail::apply_activation(value, fused_activation);
template <class TShape>
void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, int32_t input_offset, int32_t filter_offset,
int32_t output_mul, int32_t output_shift, int32_t output_offset, const TShape &in_shape, int32_t groups, int32_t out_channels,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w)
const auto out_h = detail::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = detail::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
const auto g_ic = in_shape[1] / groups;
const auto g_oc = out_channels / groups;
for (int32_t batch = 0; batch < in_shape[0]; batch++)
const uint8_t *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int32_t og = 0; og < groups; og++)
const uint8_t *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
for (int32_t oc = 0; oc < g_oc; oc++)
const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
for (int32_t oy = 0; oy < out_h; oy++)
for (int32_t ox = 0; ox < out_w; ox++)
const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
int32_t value = bias[og * g_oc + oc];
for (int32_t ic = 0; ic < g_ic; ic++)
const uint8_t *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
const uint8_t *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
const int32_t in_y = in_y_origin + dilation_h * ky;
const int32_t in_x = in_x_origin + dilation_w * kx;
const int32_t in_v = (int32_t)in_c_p[in_y * in_shape[3] + in_x] + input_offset;
const int32_t w = (int32_t)w_ic_p[ky * filter_w + kx] + filter_offset;
value += in_v * w;
auto output_val = static_cast<int32_t>(runtime::mul_and_carry_shift(value, output_mul, output_shift));
output_val += output_offset;
*output++ = (uint8_t)std::clamp(output_val, 0, 255);
template <class TShape>
void conv2d_transpose(const float *input, float *output, const float *weights, [[maybe_unused]] const float *bias, const TShape &in_shape,
int32_t groups, const TShape &out_shape, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
std::fill(output, output + kernels::detail::compute_size(out_shape), 0.f);
const auto g_ic = in_shape[1] / groups;
const auto g_oc = out_shape[1] / groups;
for (size_t batch = 0; batch < in_shape[0]; batch++)
float *out_batch_p = output + (size_t)batch * out_shape[1] * out_shape[2] * out_shape[3];
for (size_t g = 0; g < (size_t)groups; g++)
float *out_group_p = out_batch_p + (size_t)g * g_oc * out_shape[2] * out_shape[3];
const float *w_group_p = weights + (size_t)g * g_oc * g_ic * filter_h * filter_w;
for (size_t ic = 0; ic < g_ic; ic++)
for (size_t iy = 0; iy < in_shape[2]; iy++)
for (size_t ix = 0; ix < in_shape[3]; ix++)
const int32_t out_y_origin = (iy * stride_h) - padding_h.before;
const int32_t out_x_origin = (ix * stride_w) - padding_w.before;
const size_t filter_y_start = (size_t)std::max(0, (-out_y_origin + dilation_h - 1) / dilation_h);
const size_t filter_y_end = (size_t)std::min(filter_h, ((int32_t)out_shape[2] - out_y_origin + dilation_h - 1) / dilation_h);
const size_t filter_x_start = (size_t)std::max(0, (-out_x_origin + dilation_w - 1) / dilation_w);
const size_t filter_x_end = (size_t)std::min(filter_w, ((int32_t)out_shape[3] - out_x_origin + dilation_w - 1) / dilation_w);
const float in_v = *input++;
for (size_t oc = 0; oc < g_oc; oc++)
assert(bias[g * g_oc + oc] == 0.f);
float *out_c_p = out_group_p + (size_t)oc * out_shape[2] * out_shape[3];
const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
for (size_t ky = filter_y_start; ky < filter_y_end; ky++)
for (size_t kx = filter_x_start; kx < filter_x_end; kx++)
const int32_t out_y = out_y_origin + dilation_h * ky;
const int32_t out_x = out_x_origin + dilation_w * kx;
const float w = w_ic_p[ky * filter_w + kx];
out_c_p[out_y * out_shape[3] + out_x] += in_v * w;
if (fused_activation != value_range<float>::full())
for (size_t i = 0; i < kernels::detail::compute_size(out_shape); i++)
output[i] = detail::apply_activation(output[i], fused_activation);
template <class TQ>
void dequantize(const TQ *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, const quant_param_t &param)
#if __riscv
riscv_dequantize(input, output, count, param);
for (size_t i = 0; i < count; i++)
output[i] = (input[i] - param.zero_point) * param.scale;
inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
for (int32_t oy = 0; oy < a_rows; oy++)
for (int32_t ox = 0; ox < b_cols; ox++)
float value = bias[ox];
for (int32_t i = 0; i < a_cols; i++)
const auto a = input_a[oy * a_cols + i];
const auto b = input_b[i * b_cols + ox];
value += a * b;
output[oy * b_cols + ox] = detail::apply_activation(value, fused_activation);
// uint8 quantized matrix product.
//
// `input_a` is read as a_rows x a_cols and `input_b` as a_cols x b_cols, both
// row-major. Operands have their respective offsets added before being
// multiplied; the int32 accumulator starts from bias[column]. The sum is
// rescaled with runtime::mul_and_carry_shift(value, output_mul, output_shift),
// `output_offset` is added and the result is clamped to [0, 255].
inline void quantized_matmul(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const int32_t *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, int32_t input_a_offset, int32_t input_b_offset,
    int32_t output_mul, int32_t output_shift, int32_t output_offset)
{
    for (int32_t row = 0; row < a_rows; row++)
    {
        const uint8_t *lhs_row = input_a + row * a_cols;
        uint8_t *out_row = output + row * b_cols;

        for (int32_t col = 0; col < b_cols; col++)
        {
            int32_t acc = bias[col];
            for (int32_t k = 0; k < a_cols; k++)
            {
                const auto lhs = (int32_t)lhs_row[k] + input_a_offset;
                const auto rhs = (int32_t)input_b[k * b_cols + col] + input_b_offset;
                acc += lhs * rhs;
            }

            auto requantized = static_cast<int32_t>(runtime::mul_and_carry_shift(acc, output_mul, output_shift));
            requantized += output_offset;
            out_row[col] = (uint8_t)std::clamp(requantized, 0, 255);
        }
    }
}
// Pads (or, with negative paddings, crops) a 4-D tensor.
//
// The output extent of axis k is in_shape[k] + paddings[k].sum(). Output
// coordinate d on axis k reads input coordinate d - paddings[k].before; when
// that coordinate falls outside [0, in_shape[k]) on any axis the element is
// `pad_value`. Output elements are written sequentially in row-major order.
//
// Source coordinates are kept as signed 64-bit values and are only turned into
// a pointer once they are known to be in range, so no out-of-bounds pointer is
// ever formed for the padded region.
template <class T, class TShape, class TPaddings>
void pad(const T *input, T *output, const TShape &in_shape, const TPaddings &paddings, T pad_value)
{
    TShape out_shape = { in_shape[0] + paddings[0].sum(),
        in_shape[1] + paddings[1].sum(),
        in_shape[2] + paddings[2].sum(),
        in_shape[3] + paddings[3].sum() };

    const int64_t in_dims[4] = { static_cast<int64_t>(in_shape[0]), static_cast<int64_t>(in_shape[1]),
        static_cast<int64_t>(in_shape[2]), static_cast<int64_t>(in_shape[3]) };
    const int64_t out_dims[4] = { static_cast<int64_t>(out_shape[0]), static_cast<int64_t>(out_shape[1]),
        static_cast<int64_t>(out_shape[2]), static_cast<int64_t>(out_shape[3]) };
    const int64_t before[4] = { static_cast<int64_t>(paddings[0].before), static_cast<int64_t>(paddings[1].before),
        static_cast<int64_t>(paddings[2].before), static_cast<int64_t>(paddings[3].before) };

    for (int64_t d0 = 0; d0 < out_dims[0]; d0++)
    {
        const int64_t s0 = d0 - before[0];
        const bool inside0 = s0 >= 0 && s0 < in_dims[0];

        for (int64_t d1 = 0; d1 < out_dims[1]; d1++)
        {
            const int64_t s1 = d1 - before[1];
            const bool inside1 = inside0 && s1 >= 0 && s1 < in_dims[1];

            for (int64_t d2 = 0; d2 < out_dims[2]; d2++)
            {
                const int64_t s2 = d2 - before[2];
                const bool inside2 = inside1 && s2 >= 0 && s2 < in_dims[2];
                // Only resolve the source row when the three outer coordinates are valid.
                const T *src_row = inside2 ? input + ((s0 * in_dims[1] + s1) * in_dims[2] + s2) * in_dims[3] : nullptr;

                for (int64_t d3 = 0; d3 < out_dims[3]; d3++)
                {
                    const int64_t s3 = d3 - before[3];
                    if (inside2 && s3 >= 0 && s3 < in_dims[3])
                        *output++ = src_row[s3];
                    else
                        *output++ = pad_value;
                }
            }
        }
    }
}
template <class TQ>
void quantize(const float *CXX_RESTRICT input, TQ *CXX_RESTRICT output, size_t count, const quant_param_t &param)
#if __riscv
riscv_quantize(input, output, count, param);
for (size_t i = 0; i < count; i++)
auto v = (int32_t)std::nearbyintf(input[i] / param.scale + param.zero_point);
output[i] = (TQ)std::clamp(v, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
template <class TReducer, class TShape>
void reduce(const float *input, float *output, float init_value, const TShape &in_shape, const TShape &reduced_shape, TReducer &&reducer)
std::fill(output, output + kernels::detail::compute_size(reduced_shape), init_value);
for (size_t d0 = 0; d0 < in_shape[0]; d0++)
for (size_t d1 = 0; d1 < in_shape[1]; d1++)
for (size_t d2 = 0; d2 < in_shape[2]; d2++)
for (size_t d3 = 0; d3 < in_shape[3]; d3++)
runtime_shape_t in_off = { d0, d1, d2, d3 };
auto out_off = kernels::detail::get_reduced_offset(in_off, reduced_shape);
const auto a = input[offset(in_shape, in_off)];
auto &b = output[offset(reduced_shape, out_off)];
b = reducer(b, a);
template <class TOp>
void unary(const float *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, TOp &&op)
for (size_t i = 0; i < count; i++)
output[i] = op(input[i]);
template <class TBinaryOp, class TOutputOp, class TShape>
void reduce_window2d(const float *input, float *output, float init_value, const TShape &in_shape, int32_t filter_h, int32_t filter_w,
int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, const padding &padding_h, const padding &padding_w,
const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
const auto out_h = kernels::detail::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = kernels::detail::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
runtime_shape_t out_shape { in_shape[0], in_shape[1], out_h, out_w };
for (size_t batch = 0; batch < in_shape[0]; batch++)
for (size_t oc = 0; oc < in_shape[1]; oc++)
for (size_t oy = 0; oy < out_h; oy++)
for (size_t ox = 0; ox < out_w; ox++)
const int32_t in_y_origin = ((int32_t)oy * stride_h) - padding_h.before;
const int32_t in_x_origin = ((int32_t)ox * stride_w) - padding_w.before;
const size_t filter_y_start = (size_t)std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
const size_t filter_y_end = (size_t)std::min(filter_h, ((int32_t)in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
const size_t filter_x_start = (size_t)std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
const size_t filter_x_end = (size_t)std::min(filter_w, ((int32_t)in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
float value = init_value;
int32_t kernel_count = 0;
for (size_t ky = filter_y_start; ky < filter_y_end; ky++)
for (size_t kx = filter_x_start; kx < filter_x_end; kx++)
const size_t in_y = in_y_origin + dilation_h * ky;
const size_t in_x = in_x_origin + dilation_w * kx;
const float in_v = input[offset(in_shape, { batch, oc, in_y, in_x })];
value = binary_op(value, in_v);
output[offset(out_shape, { batch, oc, oy, ox })] = kernels::detail::apply_activation(window_op(value, kernel_count), fused_activation);
// Nearest-neighbour resize of the last two axes of a 4-D tensor.
//
// Output pixel (y, x) copies input pixel
//     (min(floor(y * in_h / out_h), in_h - 1), min(floor(x * in_w / out_w), in_w - 1))
// where in_h = in_shape[2] and in_w = in_shape[3]. The first two axes are
// iterated unchanged and the output is written sequentially.
template <class T, class TShape>
void resize_nearest_neighbor(const T *input, T *output, const TShape &in_shape, int32_t out_h, int32_t out_w)
{
    const auto height_scale = (float)in_shape[2] / out_h;
    const auto width_scale = (float)in_shape[3] / out_w;
    const size_t target_h = (size_t)out_h;
    const size_t target_w = (size_t)out_w;

    for (size_t n = 0; n < in_shape[0]; n++)
    {
        const T *batch_base = input + n * in_shape[1] * in_shape[2] * in_shape[3];

        for (size_t c = 0; c < in_shape[1]; c++)
        {
            const T *plane = batch_base + c * in_shape[2] * in_shape[3];

            for (size_t dst_y = 0; dst_y < target_h; dst_y++)
            {
                const auto src_y = std::min((size_t)floorf(dst_y * height_scale), in_shape[2] - 1);
                const T *src_row = plane + src_y * in_shape[3];

                for (size_t dst_x = 0; dst_x < target_w; dst_x++)
                {
                    const auto src_x = std::min((size_t)floorf(dst_x * width_scale), in_shape[3] - 1);
                    *output++ = src_row[src_x];
                }
            }
        }
    }
}
// Bilinear resize of the last two axes of a 4-D tensor.
//
// The source position of output pixel (y, x) is (y * height_scale, x * width_scale)
// with scale = in / out, or (in - 1) / (out - 1) when `align_corners` is set and
// the output extent is greater than 1. The four neighbouring input samples
// (the "+1" neighbours are clamped to the last row/column) are blended with the
// fractional parts of the source position and the result is converted to T.
//
// Results are written through the advancing output pointer, like the other
// kernels in this file, instead of through an `int` index that could overflow
// on very large outputs. Row-dependent weights and row pointers are computed
// once per output row.
template <class T, class TShape>
inline void resize_bilinear(const T *input, T *output, const TShape &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
{
    auto height_scale = (float)in_shape[2] / out_h;
    auto width_scale = (float)in_shape[3] / out_w;
    if (align_corners && out_h > 1)
        height_scale = (float)(in_shape[2] - 1) / (out_h - 1);
    if (align_corners && out_w > 1)
        width_scale = (float)(in_shape[3] - 1) / (out_w - 1);

    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];

        for (size_t oc = 0; oc < in_shape[1]; oc++)
        {
            auto in_c = in_batch + (size_t)oc * in_shape[2] * in_shape[3];

            for (size_t oy = 0; oy < (size_t)out_h; oy++)
            {
                const auto in_y = oy * height_scale;
                const auto in_y0 = (size_t)floorf(in_y);
                const auto in_y1 = std::min(in_y0 + 1, in_shape[2] - 1);
                // Vertical blend weights: wy1 towards row in_y1, wy0 towards row in_y0.
                const auto wy1 = in_y - in_y0;
                const auto wy0 = 1 - wy1;
                const auto row0 = in_c + in_y0 * in_shape[3];
                const auto row1 = in_c + in_y1 * in_shape[3];

                for (size_t ox = 0; ox < (size_t)out_w; ox++)
                {
                    const auto in_x = ox * width_scale;
                    const auto in_x0 = (size_t)floorf(in_x);
                    const auto in_x1 = std::min(in_x0 + 1, in_shape[3] - 1);
                    const auto wx1 = in_x - in_x0;
                    const auto wx0 = 1 - wx1;

                    const auto v0 = row0[in_x0];
                    const auto v1 = row1[in_x0];
                    const auto v2 = row0[in_x1];
                    const auto v3 = row1[in_x1];

                    const auto a0 = wy0 * wx0;
                    const auto a1 = wy1 * wx0;
                    const auto a2 = wy0 * wx1;
                    const auto a3 = wy1 * wx1;

                    *output++ = T(v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3);
                }
            }
        }
    }
}
// Row-wise softmax with temperature `beta`.
//
// The input is treated as `outer_size` rows of `inner_size` floats. For each
// row the maximum is subtracted before exponentiation (so the largest exponent
// is 0), every element becomes exp((x - max) * beta), and the row is divided by
// the sum of those exponentials.
//
// Rows of length zero are skipped entirely: std::max_element on an empty range
// returns the end iterator, which must not be dereferenced.
inline void softmax(const float *input, float *output, float beta, int32_t outer_size, size_t inner_size)
{
    if (inner_size == 0)
        return;

    for (int32_t batch = 0; batch < outer_size; batch++)
    {
        auto src = input + batch * inner_size;
        auto dest = output + batch * inner_size;

        auto max = *std::max_element(src, src + inner_size);
        float sum = 0;

        for (size_t i = 0; i < inner_size; i++)
        {
            auto value = expf((src[i] - max) * beta);
            sum += value;
            dest[i] = value;
        }

        for (size_t i = 0; i < inner_size; i++)
            dest[i] /= sum;
    }
}
template <class T, class TShape>
void transpose(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const TShape &in_shape, const TShape &in_strides, const TShape &out_strides, const TShape &perm)
runtime_shape_t out_shape(in_shape.size());
for (size_t i = 0; i < in_shape.size(); i++)
out_shape[i] = in_shape[perm[i]];
runtime_shape_t i(4), o(4);
for (o[3] = 0; o[3] < out_shape[3]; o[3]++)
i[perm[3]] = o[3];
for (o[2] = 0; o[2] < out_shape[2]; o[2]++)
i[perm[2]] = o[2];
for (o[1] = 0; o[1] < out_shape[1]; o[1]++)
i[perm[1]] = o[1];
for (o[0] = 0; o[0] < out_shape[0]; o[0]++)
i[perm[0]] = o[0];
output[offset(out_strides, o)] = input[offset(in_strides, i)];
template <class T, class TShape>
void strided_slice(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const TShape &in_shape, const TShape &begin, const TShape &end, const TShape &strides)
auto loop_cond = [](int32_t i, int32_t stop, int32_t stride) {
return stride > 0 ? i < stop : i > stop;
for (int32_t d0 = begin[0]; loop_cond(d0, end[0], strides[0]); d0 += strides[0])
auto d0_origin = input + (size_t)d0 * in_shape[1] * in_shape[2] * in_shape[3];
for (int d1 = begin[1]; loop_cond(d1, end[1], strides[1]); d1 += strides[1])
auto d1_origin = d0_origin + (size_t)d1 * in_shape[2] * in_shape[3];
for (int32_t d2 = begin[2]; loop_cond(d2, end[2], strides[2]); d2 += strides[2])
auto d2_origin = d1_origin + (size_t)d2 * in_shape[3];
for (int32_t d3 = begin[3]; loop_cond(d3, end[3], strides[3]); d3 += strides[3])
*output++ = d2_origin[d3];
// Evaluates an NNIL program (`body`) once per input element using an
// evaluation stack, writing one result per element to `output`.
//
// For every element a fresh nnil_evalstack and a fresh reader over `body` are
// created; opcodes are dispatched through the switch below until the reader is
// exhausted or nnil_ret stores the top of the stack into output[i].
//
// NOTE(review): this block is incomplete as it stands -- block delimiters, the
// expression initialising `op`, the bodies of several cases and the statements
// that end each case are not present in this text. Restore them from the
// upstream source before building; the comments below describe only what is
// visible here.
inline void nnil_unary_method(const float *input, float *output, size_t count, gsl::span<const gsl::byte> body)
using namespace nncase::runtime;
for (size_t i = 0; i < count; i++)
// Per-element interpreter state: a new stack and a reader positioned at the start of `body`.
nnil_evalstack stack;
span_reader sr(body);
nnil_reader reader(sr);
bool ret = false;
// Run until the program is exhausted or nnil_ret has produced the result.
while (reader.avail() && !ret)
// NOTE(review): the initialiser is missing; presumably the next decoded op from `reader` -- confirm.
auto op =;
switch (op.opcode)
// NOTE(review): no statements are visible for the following cases.
case nnil_nop:
case nnil_dup:
case nnil_pop:
case nnil_lda_0:
case nnil_ldc_r4_0:
case nnil_ldc_r4_1:
case nnil_ldc_r4:
case nnil_abs:
case nnil_ceil:
case nnil_cos:
case nnil_exp:
case nnil_floor:
case nnil_log:
case nnil_neg:
case nnil_rsqrt:
// Reciprocal square root of the popped value.
stack.push(1.f / sqrtf(stack.pop()));
case nnil_sin:
case nnil_square:
auto v = stack.pop();
stack.push(v * v);
// Binary ops pop the right operand first, then the left one.
case nnil_add:
auto b = stack.pop();
auto a = stack.pop();
stack.push(a + b);
case nnil_sub:
auto b = stack.pop();
auto a = stack.pop();
stack.push(a - b);
case nnil_mul:
auto b = stack.pop();
auto a = stack.pop();
stack.push(a * b);
case nnil_div:
auto b = stack.pop();
auto a = stack.pop();
stack.push(a / b);
case nnil_min:
auto b = stack.pop();
auto a = stack.pop();
stack.push(std::min(a, b));
case nnil_max:
auto b = stack.pop();
auto a = stack.pop();
stack.push(std::max(a, b));
case nnil_pow:
auto b = stack.pop();
auto a = stack.pop();
stack.push(std::pow(a, b));
// Clamp pops the upper bound, then the lower bound, then the value.
case nnil_clamp:
auto high = stack.pop();
auto low = stack.pop();
auto v = stack.pop();
stack.push(std::clamp(v, low, high));
// Store the result for this element and leave the interpreter loop.
case nnil_ret:
output[i] = stack.pop();
ret = true;
// NOTE(review): presumably the `default:` branch for unrecognised opcodes -- confirm.
throw std::runtime_error("Invalid nnil op");
inline void table_lookup1d(const uint8_t *CXX_RESTRICT input, uint8_t *CXX_RESTRICT output, size_t size, const uint8_t *CXX_RESTRICT table)
for (size_t i = 0; i < size; i++)
output[i] = table[input[i]];