Support KModel V4

pull/89/head
sunnycase 2019-07-24 15:24:18 +08:00
parent 49d25c8f4a
commit 6c201580a0
42 changed files with 5141 additions and 832 deletions

.gitignore (vendored): 1634 lines changed; diff suppressed because it is too large.

@@ -22,6 +22,11 @@ ENDIF ()
# definitions in macros
add_definitions(-DCONFIG_LOG_LEVEL=LOG_VERBOSE -DCONFIG_LOG_ENABLE -DCONFIG_LOG_COLORS -DLOG_KERNEL -D__riscv64 -DLV_CONF_INCLUDE_SIMPLE)
# xtl options
add_definitions(-DTCB_SPAN_NO_EXCEPTIONS -DTCB_SPAN_NO_CONTRACT_CHECKING)
# nncase options
add_definitions(-DNNCASE_TARGET=k210)
if (NOT SDK_ROOT)
get_filename_component(_SDK_ROOT ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
global_set(SDK_ROOT ${_SDK_ROOT})

@@ -40,6 +40,7 @@ if (BUILDING_SDK)
-Wno-error=unused-but-set-variable
-Wno-error=unused-variable
-Wno-error=deprecated-declarations
-Wno-multichar
-Wextra
-Werror=frame-larger-than=32768
-Wno-unused-parameter

@@ -103,7 +103,7 @@ SECTIONS
{
PROVIDE_HIDDEN (__init_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)
PROVIDE_HIDDEN (__init_array_end = .);
} >ram AT>ram :ram_ro

@@ -5,8 +5,8 @@
FILE(GLOB_RECURSE LIB_SRC
"${CMAKE_CURRENT_LIST_DIR}/*.h"
"${CMAKE_CURRENT_LIST_DIR}/*.hpp"
"${CMAKE_CURRENT_LIST_DIR}/*.cpp"
"${CMAKE_CURRENT_LIST_DIR}/*.c"
"${CMAKE_CURRENT_LIST_DIR}/*.cpp"
"${CMAKE_CURRENT_LIST_DIR}/*.s"
"${CMAKE_CURRENT_LIST_DIR}/*.S"
)
@@ -16,7 +16,8 @@ FILE(GLOB_RECURSE ASSEMBLY_FILES
"${CMAKE_CURRENT_LIST_DIR}/*.S"
)
include_directories(${CMAKE_CURRENT_LIST_DIR}/drivers/include ${CMAKE_CURRENT_LIST_DIR}/bsp/include)
include_directories(${SDK_ROOT}/third_party/xtl/include)
include_directories(${CMAKE_CURRENT_LIST_DIR}/drivers/include ${CMAKE_CURRENT_LIST_DIR}/bsp/include ${CMAKE_CURRENT_LIST_DIR}/nncase/include)
SET_PROPERTY(SOURCE ${ASSEMBLY_FILES} PROPERTY LANGUAGE C)
SET_SOURCE_FILES_PROPERTIES(${ASSEMBLY_FILES} PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp -D __riscv64")

@@ -663,18 +663,31 @@ typedef void (*kpu_done_callback_t)(void *userdata);
typedef struct
{
const uint8_t *model_buffer;
uint8_t *main_buffer;
uint32_t output_count;
const kpu_model_output_t *outputs;
const kpu_model_layer_header_t *layer_headers;
const uint8_t *body_start;
uint32_t layers_length;
volatile uint32_t current_layer;
const uint8_t *volatile current_body;
dmac_channel_number_t dma_ch;
kpu_done_callback_t done_callback;
void *userdata;
int is_nncase;
union
{
struct
{
const uint8_t *model_buffer;
uint8_t *main_buffer;
uint32_t output_count;
const kpu_model_output_t *outputs;
const kpu_model_layer_header_t *layer_headers;
const uint8_t *body_start;
uint32_t layers_length;
volatile uint32_t current_layer;
const uint8_t *volatile current_body;
dmac_channel_number_t dma_ch;
kpu_done_callback_t done_callback;
void *userdata;
};
struct
{
void* nncase_ctx;
};
};
} kpu_model_context_t;
typedef struct
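Note (illustration, not part of this commit): is_nncase is the tag for the anonymous union above; the V3 loader fills the legacy field block, while the V4 path stores only an opaque interpreter handle in nncase_ctx. A minimal sketch of the access pattern, assuming the struct as declared:

    // Sketch only: how the tagged union is meant to be read.
    void describe_ctx(const kpu_model_context_t *ctx)
    {
        if (ctx->is_nncase)
            handle_v4(ctx->nncase_ctx);                     // opaque nncase interpreter
        else
            handle_v3(ctx->model_buffer, ctx->main_buffer); // legacy V3 fields
    }
    // handle_v4/handle_v3 are hypothetical helpers, named here for illustration.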

@@ -10,6 +10,7 @@
#include "dmac.h"
#include "kpu.h"
#include "printf.h"
#include "nncase.h"
#define LAYER_BURST_SIZE 12
@@ -1361,6 +1362,7 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
if(header->version == 3 && header->arch == 0)
{
ctx->is_nncase = 0;
ctx->model_buffer = buffer;
ctx->output_count = header->output_count;
ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
@@ -1370,6 +1372,9 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
if(!ctx->main_buffer)
return -1;
} else if(header->version == 'KMDL')
{
return nncase_load_kmodel(ctx, buffer);
} else
{
return -1;
@@ -1380,6 +1385,9 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
if(ctx->is_nncase)
return nncase_get_output(ctx, index, data, size);
if(index >= ctx->output_count)
return -1;
@@ -1391,6 +1399,9 @@ int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
void kpu_model_free(kpu_model_context_t *ctx)
{
if(ctx->is_nncase)
return nncase_model_free(ctx);
free(ctx->main_buffer);
ctx->main_buffer = NULL;
}
@@ -1595,6 +1606,9 @@ static void ai_step_not_isr(void *userdata)
int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
if(ctx->is_nncase)
return nncase_run_kmodel(ctx, src, dma_ch, done_callback, userdata);
ctx->dma_ch = dma_ch;
ctx->done_callback = done_callback;
ctx->userdata = userdata;
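Note (illustration, not part of this commit): with the dispatch above, application code is identical for V3 and V4 models; the same four calls cover both. A minimal sketch, assuming a kmodel blob already in memory, a prepared input buffer, and a free DMAC channel (the names and channel choice are placeholders):

    #include "kpu.h"

    static volatile int g_done;
    static void on_done(void *userdata) { g_done = 1; }

    int run_once(const uint8_t *kmodel, const uint8_t *input)
    {
        kpu_model_context_t ctx;
        if (kpu_load_kmodel(&ctx, kmodel) != 0)    // accepts V3 and V4 ('KMDL')
            return -1;
        g_done = 0;
        if (kpu_run_kmodel(&ctx, input, DMAC_CHANNEL5, on_done, NULL) != 0)
            return -1;
        while (!g_done)
            ;                                       // poll; real code would sleep
        uint8_t *out;
        size_t size;
        kpu_get_output(&ctx, 0, &out, &size);
        // Consume `out` before freeing: it points into the model's working memory.
        kpu_model_free(&ctx);
        return 0;
    }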

lib/nncase/.clang-format (new file)

@@ -0,0 +1,8 @@
---
BasedOnStyle: WebKit
BreakBeforeBraces: Allman
ConstructorInitializerAllOnOneLineOrOnePerLine: 'true'
UseTab: Never
PointerAlignment: Right
...

@@ -0,0 +1,97 @@
#pragma once
#include <array>
#include <optional>
#include <stdint.h>
namespace nncase
{
typedef enum _datatype
{
dt_float32,
dt_uint8
} datatype_t;
struct padding
{
int32_t before;
int32_t after;
int32_t sum() const noexcept { return before + after; }
static padding zero() noexcept { return {}; }
};
template <class T>
struct value_range
{
T min;
T max;
};
typedef enum _reduce_op
{
reduce_mean,
reduce_min,
reduce_max
} reduce_op_t;
typedef enum _binary_op
{
binary_add,
binary_sub,
binary_mul,
binary_div
} binary_op_t;
typedef struct _quant_param
{
int32_t zero_point;
float scale;
} quant_param_t;
inline bool operator==(const quant_param_t &lhs, const quant_param_t &rhs) noexcept
{
return lhs.zero_point == rhs.zero_point && lhs.scale == rhs.scale;
}
struct fixed_mul
{
float mul;
int8_t shift;
};
typedef enum _memory_type
{
mem_const,
mem_main,
mem_k210_kpu
} memory_type_t;
using runtime_shape_t = std::array<int, 4>;
using runtime_paddings_t = std::array<padding, 4>;
struct scalar
{
datatype_t type;
std::array<uint8_t, 4> storage;
scalar() = default;
template <class T>
scalar(T &&value) { as<T>() = value; }
template <class T>
T &as() noexcept { return *reinterpret_cast<T *>(storage.data()); }
template <class T>
const T &as() const noexcept { return *reinterpret_cast<const T *>(storage.data()); }
};
struct memory_range
{
memory_type_t memory_type;
datatype_t datatype;
uint32_t start;
uint32_t size;
};
}
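Note (illustration, not part of this commit): quant_param_t encodes the usual affine quantization. In this codebase the convention is q = round(x * scale + zero_point) and x = (q - zero_point) / scale (see quantize/dequantize in the neutral kernels further down). Worked numbers as a sketch, with made-up parameters for a [-1, 1] range:

    #include <cmath>

    nncase::quant_param_t p { 127, 127.5f };             // zero_point, scale
    float x = 0.5f;
    int q = (int)std::round(x * p.scale + p.zero_point);  // 191
    float back = (q - p.zero_point) / p.scale;            // ~0.502, error ~0.002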

@@ -0,0 +1,257 @@
#pragma once
#include "../utils.h"
#include <runtime_op_utility.h>
namespace nncase
{
namespace kernels
{
namespace cpu
{
inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < out_channels; oc++)
{
auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
float value = bias[oc];
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
for (int ic = 0; ic < in_shape[3]; ic++)
value += in_pix[ic] * w_pix[ic];
}
}
*output++ = details::apply_activation(value, fused_activation);
}
}
}
}
}
inline void depthwise_conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < in_shape[3]; oc++)
{
auto w_oc = weights + (size_t)oc * filter_h * filter_w;
float value = bias[oc];
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
value += in_pix[oc] * w_pix[0];
}
}
*output++ = details::apply_activation(value, fused_activation);
}
}
}
}
}
template <class TBinaryOp, class TOutputOp>
void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < in_shape[3]; oc++)
{
float value = init_value;
int32_t kernel_count = 0;
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
value = binary_op(value, in_pix[oc]);
kernel_count++;
}
}
*output++ = details::apply_activation(window_op(value, kernel_count), fused_activation);
}
}
}
}
}
inline void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < out_channels; oc++)
{
auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
int32_t value = bias[oc];
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
for (int ic = 0; ic < in_shape[3]; ic++)
value += (in_pix[ic] - input_offset) * (w_pix[ic] - filter_offset);
}
}
value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
*output++ = (uint8_t)std::clamp(value, 0, 255);
}
}
}
}
}
inline void quantized_depthwise_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
{
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
for (int ox = 0; ox < out_w; ox++)
{
int in_y_origin = (oy * stride_h) - padding_h.before;
int in_x_origin = (ox * stride_w) - padding_w.before;
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
int filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
for (int oc = 0; oc < in_shape[3]; oc++)
{
auto w_oc = weights + (size_t)oc * filter_h * filter_w;
int32_t value = bias[oc];
for (int ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int kx = filter_x_start; kx < filter_x_end; kx++)
{
int in_y = in_y_origin + dilation_h * ky;
int in_x = in_x_origin + dilation_w * kx;
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
value += (in_pix[oc] - input_offset) * (w_pix[0] - filter_offset);
}
}
value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
*output++ = (uint8_t)std::clamp(value, 0, 255);
}
}
}
}
}
}
}
}

@@ -0,0 +1,256 @@
#pragma once
#include "../utils.h"
#include <runtime_op_utility.h>
#include <targets/k210/k210_runtime_op_utility.h>
namespace nncase
{
namespace kernels
{
namespace k210
{
inline void kpu_upload(const uint8_t *src, uint8_t *dest, const runtime_shape_t &in_shape)
{
if (in_shape[3] % 64 == 0)
{
std::copy(src, src + kernels::details::compute_size(in_shape), dest);
}
else
{
auto layout = targets::k210::get_kpu_row_layout(in_shape[3]);
auto fmap_size = targets::k210::get_kpu_bytes(in_shape[3], in_shape[2], in_shape[1]);
for (int32_t batch = 0; batch < in_shape[0]; batch++)
{
auto batch_origin = dest + (size_t)batch * fmap_size;
for (int32_t oc = 0; oc < in_shape[1]; oc++)
{
auto channel_origin = batch_origin + (size_t)oc / layout.groups * layout.row_len * in_shape[2] * 64 + (size_t)oc % layout.groups * layout.row_pitch;
for (int32_t y = 0; y < in_shape[2]; y++)
{
auto y_origin = channel_origin + (size_t)y * layout.row_len * 64;
std::copy(src, src + in_shape[3], y_origin);
src += in_shape[3];
}
}
}
}
}
#if NNCASE_TARGET_K210_SIMULATOR
inline void kpu_download(const uint8_t *src, uint8_t *dest, const runtime_shape_t &in_shape)
{
if (in_shape[3] % 64 == 0)
{
std::copy(src, src + kernels::details::compute_size(in_shape), dest);
}
else
{
auto layout = targets::k210::get_kpu_row_layout(in_shape[3]);
auto fmap_size = targets::k210::get_kpu_bytes(in_shape[3], in_shape[2], in_shape[1]);
for (int32_t batch = 0; batch < in_shape[0]; batch++)
{
auto batch_origin = src + (size_t)batch * fmap_size;
for (int32_t oc = 0; oc < in_shape[1]; oc++)
{
auto channel_origin = batch_origin + (size_t)oc / layout.groups * layout.row_len * in_shape[2] * 64 + (size_t)oc % layout.groups * layout.row_pitch;
for (int32_t y = 0; y < in_shape[2]; y++)
{
auto y_origin = channel_origin + (size_t)y * layout.row_len * 64;
for (int32_t x = 0; x < in_shape[3]; x++)
*dest++ = y_origin[x];
}
}
}
}
}
template <bool IsDepthwise, int32_t FilterSize>
void kpu_conv2d(const uint8_t *input, int64_t *workspace, uint8_t *output, const uint8_t *weights, int32_t in_h, int32_t in_w, int32_t in_channels, int32_t out_channels, uint8_t pad_value, int32_t arg_x,
int32_t shift_x, int32_t arg_w, int32_t shift_w, int64_t arg_add, const targets::k210::kpu_batchnorm_segment *batchnorm, const targets::k210::kpu_activation_table_t &activation)
{
const auto channel_size = size_t(in_h) * in_w;
// conv
{
auto out_it = workspace;
const auto pad = FilterSize == 1 ? 0 : 1;
const auto groups = IsDepthwise ? out_channels : 1;
const auto g_ic = IsDepthwise ? 1 : in_channels / groups;
const auto g_oc = IsDepthwise ? 1 : out_channels;
for (int32_t og = 0; og < groups; og++)
{
const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * FilterSize * FilterSize;
for (int32_t oc = 0; oc < g_oc; oc++)
{
const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * FilterSize * FilterSize;
for (int32_t oy = 0; oy < in_h; oy++)
{
for (int32_t ox = 0; ox < in_w; ox++)
{
const int32_t in_y_origin = oy - pad;
const int32_t in_x_origin = ox - pad;
int64_t value = 0;
int64_t sum_x = 0, sum_w = 0;
for (int32_t ic = 0; ic < g_ic; ic++)
{
const uint8_t *in_c_p = input + ((size_t)og * g_ic + ic) * in_h * in_w;
const uint8_t *w_ic_p = w_oc_p + (size_t)ic * FilterSize * FilterSize;
for (int32_t ky = 0; ky < FilterSize; ky++)
{
for (int32_t kx = 0; kx < FilterSize; kx++)
{
const int32_t in_y = in_y_origin + ky;
const int32_t in_x = in_x_origin + kx;
uint8_t x;
if (in_x < 0 || in_x >= in_w
|| in_y < 0 || in_y >= in_h)
x = pad_value;
else
x = in_c_p[in_y * in_w + in_x];
uint8_t w = w_ic_p[ky * FilterSize + kx];
sum_x += x;
sum_w += w;
value += (int32_t)x * w;
}
}
}
*out_it++ = value + (arg_x * sum_x >> shift_x) + (arg_w * sum_w >> shift_w) + arg_add * g_ic;
}
}
}
}
}
// bn act
{
auto src_it = workspace;
auto out_it = output;
for (int32_t oc = 0; oc < out_channels; oc++)
{
const auto &bn = batchnorm[oc];
for (size_t i = 0; i < channel_size; i++)
{
auto value = (*src_it++ * bn.mul >> bn.shift) + bn.add;
auto &seg = *std::find_if(activation.rbegin(), activation.rend(), [value](const targets::k210::kpu_activation_segment &seg) {
return value > seg.start_x;
});
value = runtime::carry_shift((value - seg.start_x) * seg.mul, seg.shift);
*out_it++ = (uint8_t)std::clamp(value, int64_t(0), int64_t(255));
}
}
}
}
inline void kpu_pool2d(const uint8_t *input, uint8_t *output, int32_t in_h, int32_t in_w, int32_t in_channels, targets::k210::kpu_pool_type_t pool_type)
{
using namespace targets::k210;
const auto filter = get_kpu_filter_size(pool_type);
const auto stride = get_kpu_filter_stride(pool_type);
const auto out_h = get_kpu_pool_output_size(in_h, pool_type);
const auto out_w = get_kpu_pool_output_size(in_w, pool_type);
for (int32_t oc = 0; oc < in_channels; oc++)
{
auto in_c_p = input + (size_t)oc * in_h * in_w;
for (int32_t oy = 0; oy < out_h; oy++)
{
for (int32_t ox = 0; ox < out_w; ox++)
{
const int32_t in_y_origin = oy * stride;
const int32_t in_x_origin = ox * stride;
int32_t value = 0;
switch (pool_type)
{
case kpu_pool_bypass:
{
const int32_t in_y = in_y_origin;
const int32_t in_x = in_x_origin;
value = in_c_p[in_y * in_w + in_x];
break;
}
case kpu_pool_max_2_s2:
case kpu_pool_max_2_s1:
case kpu_pool_max_4_s4:
{
for (int32_t ky = 0; ky < filter; ky++)
{
for (int32_t kx = 0; kx < filter; kx++)
{
const int32_t in_y = in_y_origin + ky;
const int32_t in_x = in_x_origin + kx;
int32_t in_v;
if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
in_v = 0;
else
in_v = in_c_p[in_y * in_w + in_x];
value = std::max(value, in_v);
}
}
break;
}
case kpu_pool_mean_2_s2:
case kpu_pool_mean_2_s1:
case kpu_pool_mean_4_s4:
{
for (int32_t ky = 0; ky < filter; ky++)
{
for (int32_t kx = 0; kx < filter; kx++)
{
const int32_t in_y = std::clamp(in_y_origin + ky, 0, in_h - 1);
const int32_t in_x = std::clamp(in_x_origin + kx, 0, in_w - 1);
const int32_t in_v = in_c_p[in_y * in_w + in_x];
value += in_v;
}
}
value /= filter * filter;
break;
}
case kpu_pool_left_top_2_s2:
case kpu_pool_left_top_4_s4:
case kpu_pool_right_top_2_s2:
{
auto k_off = get_kpu_select_pool_offset(pool_type);
const int32_t in_y = in_y_origin + k_off[0];
const int32_t in_x = in_x_origin + k_off[1];
int32_t in_v;
if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
in_v = 0;
else
in_v = in_c_p[in_y * in_w + in_x];
value = in_v;
break;
}
}
*output++ = (uint8_t)value;
}
}
}
}
#endif
}
}
}

@@ -0,0 +1,422 @@
#pragma once
#include "../utils.h"
#include <cmath>
#include <runtime_op_utility.h>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace kernels
{
namespace neutral
{
template <class TOp>
void binary(const float *input_a, const float *input_b, float *output, const runtime_shape_t &in_a_shape,
const runtime_shape_t &in_b_shape, const runtime_shape_t &out_shape, const value_range<float> &fused_activation, TOp &&op)
{
for (int32_t d0 = 0; d0 < out_shape[0]; d0++)
{
for (int32_t d1 = 0; d1 < out_shape[1]; d1++)
{
for (int32_t d2 = 0; d2 < out_shape[2]; d2++)
{
for (int32_t d3 = 0; d3 < out_shape[3]; d3++)
{
runtime_shape_t in_off = { d0, d1, d2, d3 };
const auto in_a_off = kernels::details::get_reduced_offset(in_off, in_a_shape);
const auto in_b_off = kernels::details::get_reduced_offset(in_off, in_b_shape);
const auto a = input_a[offset(in_a_shape, in_a_off)];
const auto b = input_b[offset(in_b_shape, in_b_off)];
output[offset(out_shape, in_off)] = kernels::details::apply_activation(op(a, b), fused_activation);
}
}
}
}
}
template <class TRange, class TPtrGetter = details::default_ptr_getter<uint8_t, TRange>>
inline void concat(xtl::span<TRange> inputs, uint8_t *output, xtl::span<const int32_t> concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {})
{
for (size_t oc = 0; oc < outer_size; oc++)
{
for (size_t i = 0; i < inputs.size(); i++)
{
auto size = inner_size * concat_dims[i];
auto src = getter(inputs[i]) + oc * size;
std::copy(src, src + size, output);
output += size;
}
}
}
inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
int32_t groups, int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
const auto out_h = details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
const auto g_ic = in_shape[1] / groups;
const auto g_oc = out_channels / groups;
for (int32_t batch = 0; batch < in_shape[0]; batch++)
{
const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int32_t og = 0; og < groups; og++)
{
const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
for (int32_t oc = 0; oc < g_oc; oc++)
{
const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
for (int32_t oy = 0; oy < out_h; oy++)
{
for (int32_t ox = 0; ox < out_w; ox++)
{
const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
float value = bias[oc];
for (int32_t ic = 0; ic < g_ic; ic++)
{
const float *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
{
const int32_t in_y = in_y_origin + dilation_h * ky;
const int32_t in_x = in_x_origin + dilation_w * kx;
const float in_v = in_c_p[in_y * in_shape[3] + in_x];
const float w = w_ic_p[ky * filter_w + kx];
value += in_v * w;
}
}
}
*output++ = details::apply_activation(value, fused_activation);
}
}
}
}
}
}
template <class TQ>
void dequantize(const TQ *input, float *output, size_t count, const quant_param_t &param)
{
float div = 1.f / param.scale;
for (size_t i = 0; i < count; i++)
{
output[i] = (input[i] - param.zero_point) * div;
}
}
inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
{
for (size_t oy = 0; oy < a_rows; oy++)
{
for (size_t ox = 0; ox < b_cols; ox++)
{
float value = bias[ox];
for (size_t i = 0; i < a_cols; i++)
{
const auto a = input_a[oy * a_cols + i];
const auto b = input_b[i * b_cols + ox];
value += a * b;
}
output[oy * b_cols + ox] = details::apply_activation(value, fused_activation);
}
}
}
template <class T>
void pad(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_paddings_t &paddings, T pad_value)
{
runtime_shape_t out_shape = { in_shape[0] + paddings[0].sum(),
in_shape[1] + paddings[1].sum(),
in_shape[2] + paddings[2].sum(),
in_shape[3] + paddings[3].sum() };
for (int d0 = 0; d0 < out_shape[0]; d0++)
{
auto d0_origin = -paddings[0].before;
auto in0 = input + ((size_t)d0_origin + d0) * in_shape[1] * in_shape[2] * in_shape[3];
for (int d1 = 0; d1 < out_shape[1]; d1++)
{
auto d1_origin = -paddings[1].before;
auto in1 = in0 + ((size_t)d1_origin + d1) * in_shape[2] * in_shape[3];
for (int d2 = 0; d2 < out_shape[2]; d2++)
{
auto d2_origin = -paddings[2].before;
auto in2 = in1 + ((size_t)d2_origin + d2) * in_shape[3];
for (int d3 = 0; d3 < out_shape[3]; d3++)
{
auto d3_origin = -paddings[3].before;
if (d0 < paddings[0].before || d0 >= out_shape[0] - paddings[0].after
|| d1 < paddings[1].before || d1 >= out_shape[1] - paddings[1].after
|| d2 < paddings[2].before || d2 >= out_shape[2] - paddings[2].after
|| d3 < paddings[3].before || d3 >= out_shape[3] - paddings[3].after)
*output++ = pad_value;
else
*output++ = in2[d3_origin + d3];
}
}
}
}
}
template <class TQ>
void quantize(const float *input, TQ *output, size_t count, const quant_param_t &param)
{
for (size_t i = 0; i < count; i++)
{
int32_t tmp = (int32_t)roundf(input[i] * param.scale + param.zero_point);
output[i] = std::clamp(tmp, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
}
}
template <class TReducer>
void reduce(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, const runtime_shape_t &reduced_shape, TReducer &&reducer)
{
std::fill(output, output + kernels::details::compute_size(reduced_shape), init_value);
for (int32_t d0 = 0; d0 < in_shape[0]; d0++)
{
for (int32_t d1 = 0; d1 < in_shape[1]; d1++)
{
for (int32_t d2 = 0; d2 < in_shape[2]; d2++)
{
for (int32_t d3 = 0; d3 < in_shape[3]; d3++)
{
runtime_shape_t in_off = { d0, d1, d2, d3 };
auto out_off = kernels::details::get_reduced_offset(in_off, reduced_shape);
const auto a = input[offset(in_shape, in_off)];
auto &b = output[offset(reduced_shape, out_off)];
b = reducer(b, a);
}
}
}
}
}
template <class TOp>
void unary(const float *input, float *output, size_t count, TOp &&op)
{
for (size_t i = 0; i < count; i++)
output[i] = op(input[i]);
}
template <class TBinaryOp, class TOutputOp>
void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, int32_t filter_h, int32_t filter_w,
int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, const padding &padding_h, const padding &padding_w,
const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
{
const auto out_h = kernels::details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
const auto out_w = kernels::details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
runtime_shape_t out_shape { in_shape[0], in_shape[1], out_h, out_w };
for (int32_t batch = 0; batch < in_shape[0]; batch++)
{
for (int32_t oc = 0; oc < in_shape[1]; oc++)
{
for (int32_t oy = 0; oy < out_h; oy++)
{
for (int32_t ox = 0; ox < out_w; ox++)
{
const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
float value = init_value;
int32_t kernel_count = 0;
for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
{
for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
{
const int32_t in_y = in_y_origin + dilation_h * ky;
const int32_t in_x = in_x_origin + dilation_w * kx;
const float in_v = input[offset(in_shape, { batch, oc, in_y, in_x })];
value = binary_op(value, in_v);
kernel_count++;
}
}
output[offset(out_shape, { batch, oc, oy, ox })] = kernels::details::apply_activation(window_op(value, kernel_count), fused_activation);
}
}
}
}
}
template <class T>
void resize_nearest_neighbor(const T *input, T *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w)
{
auto height_scale = (float)in_shape[2] / out_h;
auto width_scale = (float)in_shape[3] / out_w;
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oc = 0; oc < in_shape[1]; oc++)
{
auto in_c = in_batch + oc * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
auto in_y = std::min((int32_t)floorf(oy * height_scale), in_shape[2] - 1);
auto in_row = in_c + in_y * in_shape[3];
for (int ox = 0; ox < out_w; ox++)
{
auto in_x = std::min((int32_t)floorf(ox * width_scale), in_shape[3] - 1);
*output++ = in_row[in_x];
}
}
}
}
}
inline void resize_bilinear(const float *input, float *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
{
auto height_scale = (float)in_shape[2] / out_h;
auto width_scale = (float)in_shape[3] / out_w;
if (align_corners && out_h > 1)
height_scale = (float)(in_shape[2] - 1) / (out_h - 1);
if (align_corners && out_w > 1)
width_scale = (float)(in_shape[3] - 1) / (out_w - 1);
auto destIdx = 0;
for (int batch = 0; batch < in_shape[0]; batch++)
{
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
for (int oc = 0; oc < in_shape[1]; oc++)
{
auto in_c = in_batch + (size_t)oc * in_shape[2] * in_shape[3];
for (int oy = 0; oy < out_h; oy++)
{
auto in_y = oy * height_scale;
auto in_y0 = (int)floorf(in_y);
auto in_y1 = std::min(in_y0 + 1, in_shape[2] - 1);
for (int ox = 0; ox < out_w; ox++)
{
auto in_x = ox * width_scale;
auto in_x0 = (int)floorf(in_x);
auto in_x1 = std::min(in_x0 + 1, in_shape[3] - 1);
auto v0 = in_c[in_y0 * in_shape[3] + in_x0];
auto v1 = in_c[in_y1 * in_shape[3] + in_x0];
auto v2 = in_c[in_y0 * in_shape[3] + in_x1];
auto v3 = in_c[in_y1 * in_shape[3] + in_x1];
auto a0 = (1 - (in_y - in_y0)) * (1 - (in_x - in_x0));
auto a1 = (in_y - in_y0) * (1 - (in_x - in_x0));
auto a2 = (1 - (in_y - in_y0)) * (in_x - in_x0);
auto a3 = (in_y - in_y0) * (in_x - in_x0);
output[destIdx++] = v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3;
}
}
}
}
}
inline void softmax(const float *input, float *output, float beta, int32_t outer_size, size_t inner_size)
{
for (size_t batch = 0; batch < outer_size; batch++)
{
auto src = input + batch * inner_size;
auto dest = output + batch * inner_size;
auto max = *std::max_element(src, src + inner_size);
float sum = 0;
for (size_t i = 0; i < inner_size; i++)
{
auto value = expf((src[i] - max) * beta);
sum += value;
dest[i] = value;
}
for (size_t i = 0; i < inner_size; i++)
dest[i] /= sum;
}
}
template <class T>
void transpose(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &perm)
{
runtime_shape_t out_shape;
for (size_t i = 0; i < 4; i++)
out_shape[i] = in_shape[perm[i]];
runtime_shape_t i, o;
for (o[3] = 0; o[3] < out_shape[3]; o[3]++)
{
i[perm[3]] = o[3];
for (o[2] = 0; o[2] < out_shape[2]; o[2]++)
{
i[perm[2]] = o[2];
for (o[1] = 0; o[1] < out_shape[1]; o[1]++)
{
i[perm[1]] = o[1];
for (o[0] = 0; o[0] < out_shape[0]; o[0]++)
{
i[perm[0]] = o[0];
output[offset(out_shape, o)] = input[offset(in_shape, i)];
}
}
}
}
}
template <class T>
void strided_slice(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &begin, const runtime_shape_t &end, const runtime_shape_t &strides)
{
auto loop_cond = [](int32_t i, int32_t stop, int32_t stride) {
return stride > 0 ? i < stop : i > stop;
};
for (int32_t d0 = begin[0]; loop_cond(d0, end[0], strides[0]); d0 += strides[0])
{
auto d0_origin = input + (size_t)d0 * in_shape[1] * in_shape[2] * in_shape[3];
for (int d1 = begin[1]; loop_cond(d1, end[1], strides[1]); d1 += strides[1])
{
auto d1_origin = d0_origin + (size_t)d1 * in_shape[2] * in_shape[3];
for (int32_t d2 = begin[2]; loop_cond(d2, end[2], strides[2]); d2 += strides[2])
{
auto d2_origin = d1_origin + (size_t)d2 * in_shape[3];
for (int32_t d3 = begin[3]; loop_cond(d3, end[3], strides[3]); d3 += strides[3])
*output++ = d2_origin[d3];
}
}
}
}
}
}
}

@@ -0,0 +1,82 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <datatypes.h>
namespace nncase
{
namespace kernels
{
inline size_t offset(const runtime_shape_t &shape, const runtime_shape_t &index)
{
return (((size_t)index[0] * shape[1] + index[1]) * shape[2] + index[2]) * shape[3] + index[3];
}
namespace details
{
inline int32_t get_windowed_output_size(int32_t size, int32_t filter, int32_t stride, int32_t dilation, const padding &padding)
{
auto effective_filter_size = (filter - 1) * dilation + 1;
return (size + padding.before + padding.after - effective_filter_size + stride) / stride;
}
inline size_t compute_size(const runtime_shape_t &shape)
{
return size_t(shape[0]) * shape[1] * shape[2] * shape[3];
}
template <class T>
inline T apply_activation(T value, value_range<T> activation)
{
return std::clamp(value, activation.min, activation.max);
}
inline runtime_shape_t get_reduced_offset(const runtime_shape_t &in_offset, const runtime_shape_t &reduced_shape)
{
runtime_shape_t off;
for (size_t i = 0; i < in_offset.size(); i++)
{
if (in_offset[i] >= reduced_shape[i])
off[i] = 0;
else
off[i] = in_offset[i];
}
return off;
}
template <class T, class TRange>
struct default_ptr_getter
{
T *operator()(const TRange &range) const noexcept { return range; }
};
template <int32_t Bits>
int32_t to_signed(uint32_t value)
{
auto mask = uint32_t(1) << (Bits - 1);
if (Bits != 32 && (value & mask) != 0)
{
auto sign = 0xFFFFFFFF << Bits;
return (int)(value | sign);
}
return (int32_t)value;
}
template <int32_t Bits>
int64_t to_signed(uint64_t value)
{
auto mask = uint64_t(1) << (Bits - 1);
if ((value & mask) != 0)
{
auto sign = 0xFFFFFFFFFFFFFFFF << Bits;
return (int64_t)(value | sign);
}
return (int64_t)value;
}
}
}
}
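Note (illustration, not part of this commit): two of these helpers carry most of the indexing logic. offset() is plain NCHW row-major linearization, and get_windowed_output_size() is the usual floor((size + pad_sum - effective_filter) / stride) + 1, folded into a single division. Worked numbers as a sketch:

    using nncase::kernels::offset;
    using nncase::kernels::details::get_windowed_output_size;

    nncase::runtime_shape_t shape { 1, 2, 3, 4 };
    size_t off = offset(shape, { 0, 1, 2, 3 });        // ((0*2+1)*3+2)*4+3 = 23

    // size=5, filter=3, stride=2, dilation=1, padding {1,1}:
    // effective filter = (3-1)*1+1 = 3, so (5+1+1-3+2)/2 = 3 output positions.
    int32_t out = get_windowed_output_size(5, 3, 2, 1, { 1, 1 });  // 3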

@@ -0,0 +1,33 @@
/* Copyright 2018 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _NNCASE_H
#define _NNCASE_H
#include "kpu.h"
#ifdef __cplusplus
extern "C" {
#endif
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer);
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size);
void nncase_model_free(kpu_model_context_t *ctx);
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata);
#ifdef __cplusplus
}
#endif
#endif

@@ -0,0 +1,51 @@
#pragma once
#include <iostream>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace runtime
{
class binary_writer
{
public:
binary_writer(std::ostream &stream)
: stream_(stream)
{
}
template <class T>
void write(T &&value)
{
stream_.write(reinterpret_cast<const char *>(&value), sizeof(value));
}
template <class T>
void write_array(xtl::span<const T> value)
{
stream_.write(reinterpret_cast<const char *>(value.data()), value.size_bytes());
}
std::streampos position() const
{
return stream_.tellp();
}
void position(std::streampos pos)
{
stream_.seekp(pos);
}
void align_position(size_t alignment)
{
auto pos = position();
auto rem = pos % alignment;
if (rem != 0)
position(pos + std::streamoff(alignment - rem));
}
private:
std::ostream &stream_;
};
}
}

@@ -0,0 +1,71 @@
#pragma once
#include "model.h"
#include <chrono>
#include <memory>
#include <optional>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace runtime
{
class interpreter_base;
typedef void (*run_callback_t)(void *userdata);
typedef void (*error_callback_t)(const char *err, void *userdata);
typedef void (*node_profile_callback_t)(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata);
typedef void (interpreter_base::*interpreter_step_t)();
class interpreter_base
{
using clock_t = std::chrono::system_clock;
public:
bool try_load_model(const uint8_t *buffer);
size_t inputs_size() const noexcept { return model_header_->inputs; }
size_t outputs_size() const noexcept { return model_header_->outputs; }
size_t nodes_size() const noexcept { return model_header_->nodes; }
const runtime_shape_t &input_shape_at(size_t index) const noexcept { return input_shapes_.at(index); }
const memory_range &input_at(size_t index) const noexcept { return inputs_[index]; }
const memory_range &output_at(size_t index) const noexcept { return outputs_[index]; }
template <class T>
xtl::span<T> memory_at(const memory_range &range) const noexcept
{
auto span = memory_at(range);
return { reinterpret_cast<T *>(span.data()), span.size() / sizeof(T) };
}
std::chrono::nanoseconds total_duration() const noexcept { return total_duration_; }
void run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata);
protected:
virtual bool initialize();
virtual xtl::span<uint8_t> memory_at(const memory_range &range) const noexcept;
private:
void step();
private:
const model_header *model_header_;
std::unique_ptr<uint8_t[]> main_mem_;
xtl::span<const memory_range> inputs_;
xtl::span<const memory_range> outputs_;
xtl::span<const runtime_shape_t> input_shapes_;
xtl::span<const node_header> node_headers_;
xtl::span<const uint8_t> constants_;
const uint8_t *node_body_start_;
error_callback_t on_error_;
run_callback_t run_callback_;
node_profile_callback_t node_profile_;
void *userdata_;
size_t cnt_node_;
const uint8_t *cnt_node_body_;
std::chrono::nanoseconds total_duration_;
std::optional<clock_t::time_point> last_time_;
runtime_opcode last_op_;
};
}
}
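Note (illustration, not part of this commit): the intended call sequence is load, fill inputs, run, read outputs; memory_at<T> turns a memory_range into a typed span over whatever backing store the target's override selects. A hedged sketch (kmodel_buffer, on_run_done, on_error and userdata are placeholders):

    nncase::runtime::interpreter_t interp;  // target-selected alias, see target_config.h
    if (!interp.try_load_model(kmodel_buffer))
        return;
    auto in = interp.memory_at<float>(interp.input_at(0));
    // ... fill `in` with the input tensor ...
    interp.run(on_run_done, on_error, nullptr, userdata);
    auto out = interp.memory_at<uint8_t>(interp.output_at(0));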

@@ -0,0 +1,20 @@
#pragma once
#include "target_config.h"
#include <datatypes.h>
#include <runtime/runtime_op.h>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace runtime
{
enum kernel_call_result
{
kcr_done,
kcr_async,
kcr_error
};
kernel_call_result call_kernel(runtime_opcode opcode, xtl::span<const uint8_t> body, interpreter_t &interpreter, interpreter_step_t step);
}
}

@@ -0,0 +1,38 @@
#pragma once
#include "../datatypes.h"
#include "runtime_op.h"
namespace nncase
{
namespace runtime
{
enum model_target : uint32_t
{
MODEL_TARGET_CPU = 0,
MODEL_TARGET_K210 = 1,
};
struct model_header
{
uint32_t identifier;
uint32_t version;
uint32_t flags;
model_target target;
uint32_t constants;
uint32_t main_mem;
uint32_t nodes;
uint32_t inputs;
uint32_t outputs;
uint32_t reserved0;
};
constexpr uint32_t MODEL_IDENTIFIER = 'KMDL';
constexpr uint32_t MODEL_VERSION = 4;
struct node_header
{
runtime_opcode opcode;
uint32_t body_size;
};
}
}
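Note (illustration, not part of this commit): MODEL_IDENTIFIER is the multi-character constant 'KMDL', which is why -Wno-multichar is added to the SDK flags earlier in this commit. It also explains the version check in kpu.c: the legacy kpu_kmodel_header_t begins with its version field, which occupies the same bytes as model_header::identifier, so a V4 file read through the old struct shows version == 'KMDL'. Sketch, assuming both headers start at offset 0 of the buffer:

    // Same bytes, two views:
    auto old_hdr = reinterpret_cast<const kpu_kmodel_header_t *>(buffer);
    auto new_hdr = reinterpret_cast<const nncase::runtime::model_header *>(buffer);
    // For a V4 file: old_hdr->version aliases new_hdr->identifier == 'KMDL'.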

@@ -0,0 +1,32 @@
BEGINE_DEFINE_TARGET(neutral)
DEFINE_RUNTIME_OP(neutral, binary, Binary, 0)
DEFINE_RUNTIME_OP(neutral, concat, Concat, 1)
DEFINE_RUNTIME_OP(neutral, conv2d, Conv2D, 2)
DEFINE_RUNTIME_OP(neutral, dequantize, Dequantize, 3)
DEFINE_RUNTIME_OP(neutral, matmul, MatMul, 4)
DEFINE_RUNTIME_OP(neutral, pad, Pad, 5)
DEFINE_RUNTIME_OP(neutral, quantize, Quantize, 6)
DEFINE_RUNTIME_OP(neutral, reduce, Reduce, 7)
DEFINE_RUNTIME_OP(neutral, reduce_window2d, ReduceWindow2D, 8)
DEFINE_RUNTIME_OP(neutral, memory_copy, MemoryCopy, 9)
DEFINE_RUNTIME_OP(neutral, resize_bilinear, ResizeBilinear, 10)
DEFINE_RUNTIME_OP(neutral, resize_nearest_neighbor, ResizeNearestNeighbor, 11)
DEFINE_RUNTIME_OP(neutral, softmax, Softmax, 12)
DEFINE_RUNTIME_OP(neutral, transpose, Transpose, 13)
DEFINE_RUNTIME_OP(neutral, strided_slice, StridedSlice, 14)
END_DEFINE_TARGET()
// CPU
BEGINE_DEFINE_TARGET(cpu)
DEFINE_RUNTIME_OP(cpu, cpu_conv2d, CPU_CPUConv2D, 1001)
DEFINE_RUNTIME_OP(cpu, cpu_depthwise_conv2d, CPU_CPUDepthwiseConv2D, 1002)
DEFINE_RUNTIME_OP(cpu, cpu_reduce_window2d, CPU_CPUReduceWindow2D, 1003)
DEFINE_RUNTIME_OP(cpu, cpu_quantized_conv2d, CPU_CPUQuantizedConv2D, 1004)
DEFINE_RUNTIME_OP(cpu, cpu_quantized_depthwise_conv2d, CPU_CPUQuantizedDepthwiseConv2D, 1005)
END_DEFINE_TARGET()
// K210
BEGINE_DEFINE_TARGET(k210)
DEFINE_RUNTIME_OP(k210, kpu_upload, K210_KPUUpload, 2001)
DEFINE_RUNTIME_OP(k210, kpu_conv2d, K210_KPUConv2D, 2002)
END_DEFINE_TARGET()

@@ -0,0 +1,37 @@
#pragma once
#include "../datatypes.h"
#include <string_view>
namespace nncase
{
namespace runtime
{
#define BEGINE_DEFINE_TARGET(...)
#define DEFINE_RUNTIME_OP(target, id, name, value) rop_##id = value,
#define END_DEFINE_TARGET()
enum runtime_opcode : uint32_t
{
#include "runtime_op.def"
};
#undef DEFINE_RUNTIME_OP
#define DEFINE_RUNTIME_OP(target, id, name, value) \
case rop_##id: \
return #name;
constexpr std::string_view node_opcode_names(runtime_opcode opcode)
{
switch (opcode)
{
#include "runtime_op.def"
default:
return {};
}
}
#undef BEGINE_DEFINE_TARGET
#undef DEFINE_RUNTIME_OP
#undef END_DEFINE_TARGET
}
}
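Note (illustration, not part of this commit): runtime_op.def is an X-macro table; runtime_op.h includes it twice with different definitions of DEFINE_RUNTIME_OP, once to emit the enumerators and once to emit the name lookup. For a single entry the two expansions look like:

    // DEFINE_RUNTIME_OP(neutral, conv2d, Conv2D, 2) expands to:
    rop_conv2d = 2,      // first include: inside enum runtime_opcode
    case rop_conv2d:     // second include: inside node_opcode_names()
        return "Conv2D";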

@@ -0,0 +1,82 @@
#pragma once
#include <xtl/xspan.hpp>
namespace nncase
{
namespace runtime
{
class span_reader
{
public:
span_reader(xtl::span<const uint8_t> span)
: span_(span)
{
}
bool empty() const noexcept { return span_.empty(); }
template <class T>
T read()
{
auto value = *reinterpret_cast<const T *>(span_.data());
advance(sizeof(T));
return value;
}
template <class T>
void read(T &value)
{
value = *reinterpret_cast<const T *>(span_.data());
advance(sizeof(T));
}
template <class T>
void read_span(xtl::span<const T> &span, size_t size)
{
span = { reinterpret_cast<const T *>(span_.data()), size };
advance(sizeof(T) * size);
}
template <class T, ptrdiff_t N>
void read_span(xtl::span<const T, N> &span)
{
span = { reinterpret_cast<const T *>(span_.data()), N };
advance(sizeof(T) * N);
}
template <class T>
const T *peek() const noexcept
{
return reinterpret_cast<const T *>(span_.data());
}
template <class T>
void get_array(const T *&value, size_t size)
{
value = peek<T>();
advance(size * sizeof(T));
}
template <class T>
void get_ref(const T *&value)
{
value = peek<T>();
advance(sizeof(T));
}
void skip(size_t count)
{
advance(count);
}
private:
void advance(size_t count)
{
span_ = span_.subspan(count);
}
private:
xtl::span<const uint8_t> span_;
};
}
}
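Note (illustration, not part of this commit): span_reader is the deserialization primitive behind every *_options::deserialize in the target headers below. Reads are sequential and unchecked, so writer and reader must agree exactly on layout. A minimal sketch (node_body is a placeholder span over a serialized node):

    nncase::runtime::span_reader reader(node_body);  // xtl::span<const uint8_t>
    auto header = reader.read<nncase::runtime::node_header>();
    xtl::span<const float> bias;
    reader.read_span(bias, 16);   // borrows 16 floats in place, no copy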

@@ -0,0 +1,15 @@
#pragma once
#define NNCASE_CONCAT_3(a, b, c) a/b/c
#define NNCASE_TARGET_HEADER_(target, name) <NNCASE_CONCAT_3(targets, target, name)>
#define NNCASE_TARGET_HEADER(name) NNCASE_TARGET_HEADER_(NNCASE_TARGET, name)
#include NNCASE_TARGET_HEADER(interpreter.h)
namespace nncase
{
namespace runtime
{
using interpreter_t = nncase::targets::NNCASE_TARGET::interpreter;
}
}

@@ -0,0 +1,70 @@
#pragma once
#include <cassert>
#include <datatypes.h>
namespace nncase
{
namespace runtime
{
inline size_t get_bytes(datatype_t type)
{
size_t element_size;
switch (type)
{
case dt_float32:
element_size = 4;
break;
case dt_uint8:
element_size = 1;
break;
default:
assert(!"Not supported data type");
}
return element_size;
}
template <int32_t Bits, class T>
uint8_t count_leading_zeros(T value)
{
uint8_t num_zeroes = 0;
for (int32_t i = Bits - 1; i >= 0; i--)
{
if ((value & (1ULL << i)) == 0)
++num_zeroes;
else
break;
}
return num_zeroes;
}
template <class T>
T carry_shift(T value, uint8_t shift)
{
if (shift > 0)
{
value >>= shift - 1;
if (value & 0x1)
{
if (value < 0)
value = (value >> 1) - 1;
else
value = (value >> 1) + 1;
}
else
{
value >>= 1;
}
}
return value;
}
inline int32_t mul_and_carry_shift(int32_t value, int32_t mul, uint8_t shift)
{
return (int32_t)carry_shift((int64_t) value * mul, shift);
}
}
}
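Note (illustration, not part of this commit): carry_shift is a rounding right shift: it shifts by shift-1, inspects the last bit about to be discarded, and bumps the result when that bit is set. mul_and_carry_shift is the fixed-point requantization step used by the quantized kernels above. Worked numbers (positive inputs):

    using nncase::runtime::carry_shift;
    using nncase::runtime::mul_and_carry_shift;

    int a = carry_shift(5, 1);                // 5/2 = 2.5, rounds to 3
    int b = carry_shift(6, 2);                // 6/4 = 1.5, rounds to 2
    int c = mul_and_carry_shift(100, 3, 2);   // (100*3) >> 2 = exactly 75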

@@ -0,0 +1,193 @@
#pragma once
#include "../node_body.h"
namespace nncase
{
namespace targets
{
namespace cpu
{
struct cpu_conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t out_channels;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
value_range<float> fused_activation;
xtl::span<const float> weights;
xtl::span<const float> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(out_channels);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(fused_activation);
reader.read_span(weights, (size_t)out_channels * in_shape[3] * filter_h * filter_w);
reader.read_span(bias, out_channels);
}
};
struct cpu_depthwise_conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
value_range<float> fused_activation;
xtl::span<const float> weights;
xtl::span<const float> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(fused_activation);
reader.read_span(weights, (size_t)in_shape[3] * filter_h * filter_w);
reader.read_span(bias, in_shape[3]);
}
};
struct cpu_reduce_window2d_options : simple_node_body<cpu_reduce_window2d_options>
{
memory_range input;
memory_range output;
reduce_op_t reduce_op;
runtime_shape_t in_shape;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
float init_value;
value_range<float> fused_activation;
};
struct cpu_quantized_conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t out_channels;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
int32_t input_offset;
int32_t filter_offset;
int32_t output_mul;
int32_t output_shift;
int32_t output_offset;
xtl::span<const uint8_t> weights;
xtl::span<const int32_t> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(out_channels);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(input_offset);
reader.read(filter_offset);
reader.read(output_mul);
reader.read(output_shift);
reader.read(output_offset);
reader.read_span(weights, (size_t)out_channels * in_shape[3] * filter_h * filter_w);
reader.read_span(bias, out_channels);
}
};
struct cpu_quantized_depthwise_conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
int32_t input_offset;
int32_t filter_offset;
int32_t output_mul;
int32_t output_shift;
int32_t output_offset;
xtl::span<const uint8_t> weights;
xtl::span<const int32_t> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(input_offset);
reader.read(filter_offset);
reader.read(output_mul);
reader.read(output_shift);
reader.read(output_offset);
reader.read_span(weights, (size_t)in_shape[3] * filter_h * filter_w);
reader.read_span(bias, in_shape[3]);
}
};
}
}
}

@@ -0,0 +1,17 @@
#pragma once
#include <runtime/interpreter.h>
namespace nncase
{
namespace targets
{
namespace cpu
{
class interpreter : public runtime::interpreter_base
{
public:
using interpreter_base::interpreter_base;
};
}
}
}

@@ -0,0 +1,44 @@
#pragma once
#include "k210_sim_types.h"
#include <runtime/interpreter.h>
namespace nncase
{
namespace targets
{
namespace k210
{
struct k210_interpreter_context
{
runtime::interpreter_base *interpreter;
runtime::interpreter_step_t step;
};
class interpreter : public runtime::interpreter_base
{
public:
using interpreter_base::memory_at;
interpreter();
#if !NNCASE_TARGET_K210_SIMULATOR
dmac_channel_number_t dma_ch() const noexcept { return dma_ch_; }
void dma_ch(dmac_channel_number_t dma_ch) noexcept { dma_ch_ = dma_ch; }
k210_interpreter_context &context() noexcept { return context_; }
#endif
protected:
xtl::span<uint8_t> memory_at(const memory_range &range) const noexcept override;
private:
#if NNCASE_TARGET_K210_SIMULATOR
std::unique_ptr<uint8_t[]> kpu_mem_;
#else
dmac_channel_number_t dma_ch_;
k210_interpreter_context context_;
#endif
};
}
}
}

@@ -0,0 +1,58 @@
#pragma once
#include "../node_body.h"
#include "k210_runtime_op_utility.h"
#include "k210_sim_types.h"
namespace nncase
{
namespace targets
{
namespace k210
{
struct kpu_upload_options : simple_node_body<kpu_upload_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
};
struct kpu_conv2d_options
{
memory_range main_mem_output;
int32_t batches;
int32_t reserved0;
kpu_layer_argument_t layer;
xtl::span<const kpu_batchnorm_argument_t> batch_norm;
const kpu_activate_table_t *activation;
xtl::span<const uint8_t> weights;
void deserialize(runtime::span_reader &reader)
{
reader.read(main_mem_output);
reader.read(batches);
reader.read(reserved0);
reader.read(layer);
auto ic = layer.image_channel_num.data.i_ch_num + 1;
auto oc = layer.image_channel_num.data.o_ch_num + 1;
auto filter = get_kpu_filter_size((kpu_filter_type_t)layer.kernel_pool_type_cfg.data.kernel_type);
auto weights_size = layer.interrupt_enabe.data.depth_wise_layer
? oc * filter * filter
: ic * oc * filter * filter;
reader.skip(layer.kernel_pool_type_cfg.data.bwsx_base_addr);
reader.read_span(batch_norm, oc);
reader.skip(layer.kernel_calc_type_cfg.data.active_addr);
reader.get_ref(activation);
reader.skip(layer.kernel_load_cfg.data.para_start_addr);
reader.read_span(weights, weights_size);
#if !NNCASE_TARGET_K210_SIMULATOR
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data();
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation;
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data();
#endif
}
};
}
}
}

@@ -0,0 +1,134 @@
#pragma once
#include "k210_sim_types.h"
namespace nncase
{
namespace targets
{
namespace k210
{
struct kpu_layout
{
int32_t groups;
int32_t row_len;
int32_t row_pitch;
};
inline kpu_layout get_kpu_row_layout(int32_t width)
{
kpu_layout layout;
if (width <= 16)
{
layout.groups = 4;
layout.row_len = 1;
layout.row_pitch = 16;
}
else if (width <= 32)
{
layout.groups = 2;
layout.row_len = 1;
layout.row_pitch = 32;
}
else
{
layout.groups = 1;
layout.row_len = (width + 63) / 64;
layout.row_pitch = 64;
}
return layout;
}
inline int32_t get_kpu_filter_size(kpu_filter_type_t filter)
{
switch (filter)
{
case kpu_filter_1x1:
return 1;
case kpu_filter_3x3:
return 3;
default:
return 0;
}
}
inline int get_kpu_rows(int32_t width, int32_t height, int32_t channels)
{
auto layout = get_kpu_row_layout(width);
auto one_line_channels = std::min(channels, layout.groups);
auto blocks = (channels + one_line_channels - 1) / one_line_channels;
auto size = layout.row_len * height * blocks;
return size;
}
inline int get_kpu_bytes(int32_t width, int32_t height, int32_t channels)
{
return get_kpu_rows(width, height, channels) * 64;
}
#if NNCASE_TARGET_K210_SIMULATOR
inline int32_t get_kpu_filter_size(kpu_pool_type_t filter)
{
switch (filter)
{
case kpu_pool_bypass:
return 1;
case kpu_pool_max_2_s2:
case kpu_pool_mean_2_s2:
case kpu_pool_left_top_2_s2:
case kpu_pool_right_top_2_s2:
case kpu_pool_max_2_s1:
case kpu_pool_mean_2_s1:
return 2;
case kpu_pool_max_4_s4:
case kpu_pool_mean_4_s4:
case kpu_pool_left_top_4_s4:
return 4;
}
}
inline int32_t get_kpu_filter_stride(kpu_pool_type_t filter)
{
switch (filter)
{
case kpu_pool_bypass:
return 1;
case kpu_pool_max_2_s2:
case kpu_pool_mean_2_s2:
case kpu_pool_left_top_2_s2:
case kpu_pool_right_top_2_s2:
return 2;
case kpu_pool_max_2_s1:
case kpu_pool_mean_2_s1:
return 1;
case kpu_pool_max_4_s4:
case kpu_pool_mean_4_s4:
case kpu_pool_left_top_4_s4:
return 4;
}
}
inline int32_t get_kpu_pool_output_size(int32_t input, kpu_pool_type_t pool_type)
{
return input / get_kpu_filter_stride(pool_type);
}
inline std::array<int32_t, 2> get_kpu_select_pool_offset(kpu_pool_type_t pool_type)
{
switch (pool_type)
{
case kpu_pool_left_top_2_s2:
return { 0, 0 };
case kpu_pool_right_top_2_s2:
return { 0, 1 };
case kpu_pool_left_top_4_s4:
return { 0, 0 };
}
}
#endif
}
}
}
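Note (illustration, not part of this commit): KPU RAM stores feature maps in 64-byte rows, packing several narrow channels into one row; get_kpu_row_layout chooses the packing from the row width. Worked numbers as a sketch:

    using namespace nncase::targets::k210;

    auto l = get_kpu_row_layout(20);  // width 17..32: groups=2, row_len=1, row_pitch=32
    // For a 20(W) x 15(H) x 8(C) map: min(8, groups)=2 channels share a row group,
    // ceil(8/2)=4 blocks, so 1 row_len * 15 rows * 4 = 60 rows of 64 bytes each:
    int bytes = get_kpu_bytes(20, 15, 8);  // 60 * 64 = 3840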

@@ -0,0 +1,249 @@
#pragma once
#include <array>
#include <cstdint>
#ifdef __riscv64
#define NNCASE_TARGET_K210_SIMULATOR 0
#include <kpu.h>
#else
#define NNCASE_TARGET_K210_SIMULATOR 1
#endif
namespace nncase
{
namespace targets
{
namespace k210
{
#if NNCASE_TARGET_K210_SIMULATOR
typedef struct
{
union {
uint64_t reg;
struct
{
uint64_t int_en : 1;
uint64_t ram_flag : 1;
uint64_t full_add : 1;
uint64_t depth_wise_layer : 1;
uint64_t reserved : 60;
} data;
} interrupt_enabe;
union {
uint64_t reg;
struct
{
uint64_t image_src_addr : 15;
uint64_t reserved0 : 17;
uint64_t image_dst_addr : 15;
uint64_t reserved1 : 17;
} data;
} image_addr;
union {
uint64_t reg;
struct
{
uint64_t i_ch_num : 10;
uint64_t reserved0 : 22;
uint64_t o_ch_num : 10;
uint64_t reserved1 : 6;
uint64_t o_ch_num_coef : 10;
uint64_t reserved2 : 6;
} data;
} image_channel_num;
union {
uint64_t reg;
struct
{
uint64_t i_row_wid : 10;
uint64_t i_col_high : 9;
uint64_t reserved0 : 13;
uint64_t o_row_wid : 10;
uint64_t o_col_high : 9;
uint64_t reserved1 : 13;
} data;
} image_size;
union {
uint64_t reg;
struct
{
uint64_t kernel_type : 3;
uint64_t pad_type : 1;
uint64_t pool_type : 4;
uint64_t first_stride : 1;
uint64_t bypass_conv : 1;
uint64_t load_para : 1;
uint64_t reserved0 : 5;
uint64_t dma_burst_size : 8;
uint64_t pad_value : 8;
uint64_t bwsx_base_addr : 32;
} data;
} kernel_pool_type_cfg;
union {
uint64_t reg;
struct
{
uint64_t load_coor : 1;
uint64_t load_time : 6;
uint64_t reserved0 : 8;
uint64_t para_size : 17;
uint64_t para_start_addr : 32;
} data;
} kernel_load_cfg;
union {
uint64_t reg;
struct
{
uint64_t coef_column_offset : 4;
uint64_t coef_row_offset : 12;
uint64_t reserved0 : 48;
} data;
} kernel_offset;
union {
uint64_t reg;
struct
{
uint64_t channel_switch_addr : 15;
uint64_t reserved : 1;
uint64_t row_switch_addr : 4;
uint64_t coef_size : 8;
uint64_t coef_group : 3;
uint64_t load_act : 1;
uint64_t active_addr : 32;
} data;
} kernel_calc_type_cfg;
union {
uint64_t reg;
struct
{
uint64_t wb_channel_switch_addr : 15;
uint64_t reserved0 : 1;
uint64_t wb_row_switch_addr : 4;
uint64_t wb_group : 3;
uint64_t reserved1 : 41;
} data;
} write_back_cfg;
union {
uint64_t reg;
struct
{
uint64_t shr_w : 4;
uint64_t shr_x : 4;
uint64_t arg_w : 24;
uint64_t arg_x : 24;
uint64_t reserved0 : 8;
} data;
} conv_value;
union {
uint64_t reg;
struct
{
uint64_t arg_add : 40;
uint64_t reserved : 24;
} data;
} conv_value2;
union {
uint64_t reg;
struct
{
uint64_t send_data_out : 1;
uint64_t reserved : 15;
uint64_t channel_byte_num : 16;
uint64_t dma_total_byte : 32;
} data;
} dma_parameter;
} kpu_layer_argument_t;
typedef struct
{
union {
uint64_t reg;
struct
{
uint64_t shift_number : 8;
uint64_t y_mul : 16;
uint64_t x_start : 36;
} data;
} activate_para[16];
union {
uint64_t reg;
struct
{
uint8_t result_bias[8];
} data;
} activate_para_bias0;
union {
uint64_t reg;
struct
{
uint8_t result_bias[8];
} data;
} activate_para_bias1;
} kpu_activate_table_t;
#endif
typedef struct
{
union {
uint64_t reg;
struct
{
uint64_t norm_mul : 24;
uint64_t norm_add : 32;
uint64_t norm_shift : 4;
} data;
} batchnorm;
} kpu_batchnorm_argument_t;
typedef enum _kpu_filter_type
{
kpu_filter_1x1 = 0,
kpu_filter_3x3 = 1
} kpu_filter_type_t;
typedef enum _kpu_pool_type
{
kpu_pool_bypass = 0,
kpu_pool_max_2_s2 = 1,
kpu_pool_mean_2_s2 = 2,
kpu_pool_max_4_s4 = 3,
kpu_pool_mean_4_s4 = 4,
kpu_pool_left_top_2_s2 = 5,
kpu_pool_right_top_2_s2 = 6,
kpu_pool_left_top_4_s4 = 7,
kpu_pool_mean_2_s1 = 8,
kpu_pool_max_2_s1 = 9
} kpu_pool_type_t;
struct kpu_batchnorm_segment
{
int32_t mul;
int32_t shift;
int32_t add;
};
struct kpu_activation_segment
{
int64_t start_x;
int32_t mul;
int32_t shift;
int32_t add;
};
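// kpu_activation_segment is the unpacked form of one hardware activation
// segment: the KPU activation unit is a 16-entry piecewise-linear table,
// and per segment the output is roughly ((x - start_x) * mul) >> shift
// plus the segment's bias (add).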
using kpu_activation_table_t = std::array<kpu_activation_segment, 16>;
}
}
}


@ -0,0 +1,258 @@
#pragma once
#include "../node_body.h"
namespace nncase
{
namespace targets
{
namespace neutral
{
struct binary_options : public simple_node_body<binary_options>
{
memory_range input_a;
memory_range input_b;
memory_range output;
binary_op_t binary_op;
runtime_shape_t in_a_shape;
runtime_shape_t in_b_shape;
runtime_shape_t out_shape;
value_range<float> fused_activation;
};
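// Concat along an arbitrary axis is lowered to this flattened form: each of
// the outer_size iterations copies a dims[i] * inner_size chunk from every
// input in order, so the kernel never needs the original axis or rank.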
struct concat_options
{
memory_range output;
uint32_t inner_size;
uint32_t outer_size;
uint32_t inputs_count;
xtl::span<const memory_range> inputs;
xtl::span<const int32_t> dims;
void deserialize(runtime::span_reader &reader)
{
reader.read(output);
reader.read(inner_size);
reader.read(outer_size);
reader.read(inputs_count);
reader.read_span(inputs, inputs_count);
reader.read_span(dims, inputs_count);
}
void serialize(runtime::binary_writer &writer) const
{
writer.write(output);
writer.write(inner_size);
writer.write(outer_size);
writer.write(inputs_count);
writer.write_array(inputs);
writer.write_array(dims);
}
};
struct conv2d_options
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t groups;
int32_t out_channels;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
value_range<float> fused_activation;
xtl::span<const float> weights;
xtl::span<const float> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input);
reader.read(output);
reader.read(in_shape);
reader.read(groups);
reader.read(out_channels);
reader.read(padding_h);
reader.read(padding_w);
reader.read(filter_h);
reader.read(filter_w);
reader.read(stride_h);
reader.read(stride_w);
reader.read(dilation_h);
reader.read(dilation_w);
reader.read(fused_activation);
reader.read_span(weights, (size_t)out_channels * in_shape[1] / groups * filter_h * filter_w);
reader.read_span(bias, out_channels);
}
void serialize(runtime::binary_writer &writer) const
{
writer.write(input);
writer.write(output);
writer.write(in_shape);
writer.write(groups);
writer.write(out_channels);
writer.write(padding_h);
writer.write(padding_w);
writer.write(filter_h);
writer.write(filter_w);
writer.write(stride_h);
writer.write(stride_w);
writer.write(dilation_h);
writer.write(dilation_w);
writer.write(fused_activation);
writer.write_array(weights);
writer.write_array(bias);
}
};
struct dequantize_options : public simple_node_body<dequantize_options>
{
memory_range input;
memory_range output;
quant_param_t quant_param;
};
struct matmul_options
{
memory_range input_a;
memory_range input_b;
memory_range output;
int32_t a_rows;
int32_t a_cols;
int32_t b_cols;
value_range<float> fused_activation;
xtl::span<const float> bias;
void deserialize(runtime::span_reader &reader)
{
reader.read(input_a);
reader.read(input_b);
reader.read(output);
reader.read(a_rows);
reader.read(a_cols);
reader.read(b_cols);
reader.read(fused_activation);
reader.read_span(bias, b_cols);
}
void serialize(runtime::binary_writer &writer) const
{
writer.write(input_a);
writer.write(input_b);
writer.write(output);
writer.write(a_rows);
writer.write(a_cols);
writer.write(b_cols);
writer.write(fused_activation);
writer.write_array(bias);
}
};
struct memory_copy_options : public simple_node_body<memory_copy_options>
{
memory_range input;
memory_range output;
};
struct pad_options : public simple_node_body<pad_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
runtime_paddings_t paddings;
scalar pad_value;
};
struct quantize_options : public simple_node_body<quantize_options>
{
memory_range input;
memory_range output;
quant_param_t quant_param;
};
struct reduce_options : public simple_node_body<reduce_options>
{
memory_range input;
memory_range output;
reduce_op_t reduce_op;
runtime_shape_t in_shape;
runtime_shape_t out_shape;
float init_value;
};
struct reduce_window2d_options : simple_node_body<reduce_window2d_options>
{
memory_range input;
memory_range output;
reduce_op_t reduce_op;
runtime_shape_t in_shape;
padding padding_h;
padding padding_w;
int32_t filter_h;
int32_t filter_w;
int32_t stride_h;
int32_t stride_w;
int32_t dilation_h;
int32_t dilation_w;
float init_value;
value_range<float> fused_activation;
};
struct resize_bilinear_options : public simple_node_body<resize_bilinear_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t out_h;
int32_t out_w;
bool align_corners;
};
struct resize_nearest_neighbor_options : public simple_node_body<resize_nearest_neighbor_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
int32_t out_h;
int32_t out_w;
bool align_corners;
};
struct softmax_options : public simple_node_body<softmax_options>
{
memory_range input;
memory_range output;
int32_t inner_size;
int32_t outer_size;
float beta;
};
struct transpose_options : public simple_node_body<transpose_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
runtime_shape_t perm;
};
struct strided_slice_options : public simple_node_body<strided_slice_options>
{
memory_range input;
memory_range output;
runtime_shape_t in_shape;
runtime_shape_t begin;
runtime_shape_t end;
runtime_shape_t strides;
int32_t begin_mask;
int32_t end_mask;
int32_t ellipsis_mask;
int32_t new_axis_mask;
int32_t shrink_axis_mask;
};
}
}
}


@ -0,0 +1,24 @@
#pragma once
#include "../runtime/binary_writer.h"
#include "../runtime/span_reader.h"
#include <datatypes.h>
namespace nncase
{
namespace targets
{
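// CRTP helper for options structs that are plain trivially-copyable data:
// (de)serialization is a single whole-struct read/write. Only ops carrying
// variable-length spans (concat, conv2d, matmul, ...) hand-roll the methods.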
template <class T>
struct simple_node_body
{
void deserialize(runtime::span_reader &reader)
{
reader.read(static_cast<T &>(*this));
}
void serialize(runtime::binary_writer &writer) const
{
writer.write(static_cast<const T &>(*this));
}
};
}
}

116
lib/nncase/nncase.cpp Normal file

@ -0,0 +1,116 @@
/* Copyright 2018 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nncase.h>
#include <runtime/target_config.h>
#include <stdio.h>
using namespace nncase;
using namespace nncase::runtime;
class nncase_context
{
public:
int load_kmodel(const uint8_t *buffer)
{
return interpreter_.try_load_model(buffer) ? 0 : -1;
}
int get_output(uint32_t index, uint8_t **data, size_t *size)
{
if (index >= interpreter_.outputs_size())
return -1;
auto mem = interpreter_.memory_at<uint8_t>(interpreter_.output_at(index));
*data = mem.data();
*size = mem.size();
return 0;
}
int run_kmodel(const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
done_callback_ = done_callback;
userdata_ = userdata;
interpreter_.dma_ch(dma_ch);
auto input = interpreter_.input_at(0);
auto mem = interpreter_.memory_at<uint8_t>(input);
std::copy(src, src + mem.size(), mem.begin());
interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
return 0;
}
private:
void on_done()
{
printf("Total: %fms\n", interpreter_.total_duration().count() / 1e6);
if (done_callback_)
done_callback_(userdata_);
}
static void done_thunk(void *userdata)
{
reinterpret_cast<nncase_context *>(userdata)->on_done();
}
static void on_error_thunk(const char *err, void *userdata)
{
printf("Fatal: %s\n", err);
}
static void node_profile_thunk(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata)
{
printf("%s: %fms\n", node_opcode_names(op).data(), duration.count() / 1e6);
}
private:
interpreter_t interpreter_;
kpu_done_callback_t done_callback_;
void *userdata_;
};
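// These four shims are what the kpu_* C API dispatches to once is_nncase is
// set. A minimal caller sketch (model_data, input and on_done are
// hypothetical names; the kpu_* entry points are the SDK's public ones):
//
//   kpu_model_context_t task;
//   kpu_load_kmodel(&task, model_data);  // routes here for V4 models
//   kpu_run_kmodel(&task, input, DMAC_CHANNEL5, on_done, NULL);
//   // ...after on_done fires:
//   uint8_t *out; size_t size;
//   kpu_get_output(&task, 0, &out, &size);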
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
auto nnctx = new (std::nothrow) nncase_context();
if (nnctx)
{
ctx->is_nncase = 1;
ctx->nncase_ctx = nnctx;
return nnctx->load_kmodel(buffer);
}
else
{
return -1;
}
}
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
return nnctx->get_output(index, data, size);
}
void nncase_model_free(kpu_model_context_t *ctx)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
delete nnctx;
ctx->nncase_ctx = nullptr;
}
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
return nnctx->run_kmodel(src, dma_ch, done_callback, userdata);
}


@ -0,0 +1,131 @@
#include <cassert>
#include <iostream>
#include <runtime/interpreter.h>
#include <runtime/kernel_registry.h>
using namespace nncase;
using namespace nncase::runtime;
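// kmodel V4 layout, in file order: model_header, input memory_ranges, input
// runtime_shapes, output memory_ranges, the constants blob, node_headers,
// and finally the concatenated node bodies that step() walks sequentially.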
bool interpreter_base::try_load_model(const uint8_t *buffer)
{
auto offset = buffer;
model_header_ = reinterpret_cast<const model_header *>(buffer);
// Validate model
if (model_header_->identifier != MODEL_IDENTIFIER || model_header_->version != MODEL_VERSION || (model_header_->target != MODEL_TARGET_CPU && model_header_->target != MODEL_TARGET_K210))
return false;
// Allocate buffers
main_mem_.reset(new (std::nothrow) uint8_t[model_header_->main_mem]);
if (!main_mem_)
return false;
offset += sizeof(model_header);
inputs_ = { reinterpret_cast<const memory_range *>(offset), inputs_size() };
offset += sizeof(memory_range) * inputs_size();
input_shapes_ = { reinterpret_cast<const runtime_shape_t *>(offset), inputs_size() };
offset += sizeof(runtime_shape_t) * inputs_size();
outputs_ = { reinterpret_cast<const memory_range *>(offset), outputs_size() };
offset += sizeof(memory_range) * outputs_size();
constants_ = { offset, model_header_->constants };
offset += constants_.size();
node_headers_ = { reinterpret_cast<const node_header *>(offset), nodes_size() };
offset += sizeof(node_header) * nodes_size();
node_body_start_ = offset;
return initialize();
}
bool interpreter_base::initialize()
{
return true;
}
void interpreter_base::run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata)
{
run_callback_ = callback;
on_error_ = on_error;
node_profile_ = node_profile;
userdata_ = userdata;
cnt_node_ = 0;
cnt_node_body_ = node_body_start_;
total_duration_ = {};
last_time_.reset();
step();
}
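// Control flow note: step() loops only while kernels finish synchronously
// (kcr_done). An asynchronous kernel (e.g. a hardware KPU conv2d) returns
// kcr_async to leave the loop, and its completion interrupt re-enters step()
// to resume the node list; the profiling timestamps therefore also cover the
// asynchronous wait.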
void interpreter_base::step()
{
auto result = kcr_done;
while (result == kcr_done)
{
if (!last_time_)
{
last_time_ = clock_t::now();
}
else
{
auto now = clock_t::now();
auto duration = now - *last_time_;
total_duration_ += duration;
last_time_ = now;
if (node_profile_)
node_profile_(last_op_, duration, userdata_);
}
if (cnt_node_ == nodes_size())
{
run_callback_(userdata_);
break;
}
else
{
auto node_id = cnt_node_++;
auto header = node_headers_[node_id];
xtl::span<const uint8_t> body(cnt_node_body_, header.body_size);
cnt_node_body_ += header.body_size;
last_op_ = header.opcode;
result = call_kernel(header.opcode, body, static_cast<interpreter_t &>(*this), &interpreter_base::step);
if (result == kcr_error)
{
if (on_error_)
{
char buffer[256];
auto name = node_opcode_names(header.opcode);
if (!name.empty())
std::sprintf(buffer, "error occurs in running kernel: %s", name.data());
else
std::sprintf(buffer, "Unknown opcode: (%d)", header.opcode);
on_error_(buffer, userdata_);
}
break;
}
}
}
}
xtl::span<uint8_t> interpreter_base::memory_at(const memory_range &range) const noexcept
{
uintptr_t base;
switch (range.memory_type)
{
case mem_const:
base = (uintptr_t)constants_.data();
break;
case mem_main:
base = (uintptr_t)main_mem_.get();
break;
default:
base = 0;
assert(!"Invalid memory type");
break;
}
return { reinterpret_cast<uint8_t *>(base + range.start), range.size };
}


@ -0,0 +1,55 @@
#include <runtime/kernel_registry.h>
#include <runtime/span_reader.h>
#include <targets/cpu/cpu_ops_body.h>
#include <targets/k210/k210_ops_body.h>
#include <targets/neutral/neutral_ops_body.h>
using namespace nncase;
using namespace nncase::runtime;
namespace nncase
{
namespace targets
{
#define BEGINE_DEFINE_TARGET(target) \
namespace target \
{
#define DEFINE_RUNTIME_OP(target, id, name, value) \
kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t);
#define END_DEFINE_TARGET() }
#include <runtime/runtime_op.def>
#undef BEGINE_DEFINE_TARGET
#undef DEFINE_RUNTIME_OP
#undef END_DEFINE_TARGET
}
}
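// runtime_op.def is an X-macro list included twice with different macro
// definitions: once above to declare each kernel entry point, and once below
// to generate a deserialize-and-dispatch case per opcode, so adding an op to
// the .def file wires up both sides automatically.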
kernel_call_result runtime::call_kernel(runtime_opcode opcode, xtl::span<const uint8_t> body, interpreter_t &interpreter, interpreter_step_t step)
{
span_reader reader(body);
switch (opcode)
{
#define BEGINE_DEFINE_TARGET(...)
#define DEFINE_RUNTIME_OP(target, id, name, value) \
case rop_##id: \
{ \
nncase::targets::target::id##_options options; \
options.deserialize(reader); \
return nncase::targets::target::id(options, interpreter, step); \
}
#define END_DEFINE_TARGET()
#include <runtime/runtime_op.def>
#undef BEGINE_DEFINE_TARGET
#undef DEFINE_RUNTIME_OP
#undef END_DEFINE_TARGET
default:
return kcr_error;
}
}


@ -0,0 +1,79 @@
#include <kernels/cpu/cpu_kernels.h>
#include <runtime/kernel_registry.h>
#include <targets/cpu/cpu_ops_body.h>
using namespace nncase;
using namespace nncase::runtime;
namespace nncase
{
namespace targets
{
namespace cpu
{
kernel_call_result cpu_conv2d(cpu_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::cpu::conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.out_channels, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation);
return kcr_done;
}
kernel_call_result cpu_depthwise_conv2d(cpu_depthwise_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::cpu::depthwise_conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation);
return kcr_done;
}
runtime::kernel_call_result cpu_reduce_window2d(cpu_reduce_window2d_options &options, interpreter_t &interpreter, runtime::interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
auto reduce = [&](auto binary_op, auto window_op) {
kernels::cpu::reduce_window2d(input.data(), output.data(), options.init_value, options.in_shape, options.filter_h, options.filter_w, options.stride_h,
options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation, binary_op, window_op);
};
switch (options.reduce_op)
{
case reduce_mean:
reduce([](auto a, auto b) { return a + b; }, [](auto v, auto k) { return v / k; });
return runtime::kcr_done;
case reduce_min:
reduce([](auto a, auto b) { return std::min(a, b); }, [](auto v, auto k) { return v; });
return runtime::kcr_done;
case reduce_max:
reduce([](auto a, auto b) { return std::max(a, b); }, [](auto v, auto k) { return v; });
return kcr_done;
default:
return kcr_error;
}
}
kernel_call_result cpu_quantized_conv2d(cpu_quantized_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::cpu::quantized_conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.out_channels, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w,
options.input_offset, options.filter_offset, options.output_mul, options.output_shift, options.output_offset);
return kcr_done;
}
kernel_call_result cpu_quantized_depthwise_conv2d(cpu_quantized_depthwise_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::cpu::quantized_depthwise_conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w,
options.input_offset, options.filter_offset, options.output_mul, options.output_shift, options.output_offset);
return kcr_done;
}
}
}
}


@ -0,0 +1,36 @@
#include <targets/k210/interpreter.h>
using namespace nncase;
using namespace nncase::runtime;
using namespace nncase::targets::k210;
interpreter::interpreter()
#if NNCASE_TARGET_K210_SIMULATOR
: kpu_mem_(std::make_unique<uint8_t[]>(2 * 1024 * 1024))
#endif
{
#if !NNCASE_TARGET_K210_SIMULATOR
kpu->interrupt_clear.reg = 7;
kpu->interrupt_mask.reg = 7;
kpu->fifo_threshold.reg = 10 | (1 << 4);
kpu->eight_bit_mode.reg = 1;
plic_set_priority(IRQN_AI_INTERRUPT, 1);
#endif
}
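// mem_k210_kpu ranges resolve to the 2 MB KPU RAM: a heap-allocated mirror
// in the simulator, or the AI_IO_BASE_ADDR aperture on real silicon. All
// other memory types fall through to the base interpreter.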
xtl::span<uint8_t> interpreter::memory_at(const memory_range &range) const noexcept
{
if (range.memory_type == mem_k210_kpu)
{
uintptr_t base =
#if NNCASE_TARGET_K210_SIMULATOR
(uintptr_t)kpu_mem_.get();
#else
(uintptr_t)AI_IO_BASE_ADDR;
#endif
return { reinterpret_cast<uint8_t *>(base + range.start), range.size };
}
return interpreter_base::memory_at(range);
}


@ -0,0 +1,179 @@
#include <kernels/k210/k210_kernels.h>
#include <runtime/kernel_registry.h>
#include <targets/k210/k210_ops_body.h>
#if !NNCASE_TARGET_K210_SIMULATOR
#include <dmac.h>
#include <sysctl.h>
#endif
using namespace nncase;
using namespace nncase::runtime;
using namespace nncase::targets::k210;
namespace
{
#if !NNCASE_TARGET_K210_SIMULATOR
void kpu_send_layer(const kpu_layer_argument_t &layer)
{
kpu->layer_argument_fifo = layer.interrupt_enabe.reg;
kpu->layer_argument_fifo = layer.image_addr.reg;
kpu->layer_argument_fifo = layer.image_channel_num.reg;
kpu->layer_argument_fifo = layer.image_size.reg;
kpu->layer_argument_fifo = layer.kernel_pool_type_cfg.reg;
kpu->layer_argument_fifo = layer.kernel_load_cfg.reg;
kpu->layer_argument_fifo = layer.kernel_offset.reg;
kpu->layer_argument_fifo = layer.kernel_calc_type_cfg.reg;
kpu->layer_argument_fifo = layer.write_back_cfg.reg;
kpu->layer_argument_fifo = layer.conv_value.reg;
kpu->layer_argument_fifo = layer.conv_value2.reg;
kpu->layer_argument_fifo = layer.dma_parameter.reg;
}
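// Two hardware submission paths follow: kpu_conv2d_normal leaves the result
// in KPU RAM and signals completion via the AI interrupt, while
// kpu_conv2d_output additionally drains the layer's output FIFO to main
// memory over DMA and uses the DMA-complete interrupt as the done signal.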
void kpu_conv2d_normal(kpu_layer_argument_t &layer, plic_irq_callback_t callback, void *userdata)
{
kpu->interrupt_clear.reg = 0b111;
kpu->interrupt_mask.reg = 0b110;
layer.interrupt_enabe.data.int_en = 1;
plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata);
plic_irq_enable(IRQN_AI_INTERRUPT);
kpu_send_layer(layer);
}
void kpu_conv2d_output(kpu_layer_argument_t &layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
{
kpu->interrupt_clear.reg = 0b111;
kpu->interrupt_mask.reg = 0b111;
layer.dma_parameter.data.send_data_out = 1;
sysctl_dma_select((sysctl_dma_channel_t)dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
dmac_set_irq(dma_ch, callback, userdata, 1);
dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
kpu_send_layer(layer);
}
int kpu_plic_thunk(void *userdata)
{
kpu->interrupt_clear.reg = 0b111;
kpu->interrupt_mask.reg = 0b111;
auto &ctx = *reinterpret_cast<k210_interpreter_context *>(userdata);
(ctx.interpreter->*ctx.step)();
return 0;
}
#endif
}
namespace nncase
{
namespace targets
{
namespace k210
{
kernel_call_result kpu_upload(kpu_upload_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::k210::kpu_upload(input.data(), output.data(), options.in_shape);
return kcr_done;
}
kernel_call_result kpu_conv2d(kpu_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
#if NNCASE_TARGET_K210_SIMULATOR
auto input = interpreter.memory_at<uint8_t>({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_src_addr * 64, 1 });
auto kpu_out = interpreter.memory_at<uint8_t>({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_dst_addr * 64, 1 });
auto in_h = static_cast<int32_t>(options.layer.image_size.data.i_col_high + 1);
auto in_w = static_cast<int32_t>(options.layer.image_size.data.i_row_wid + 1);
auto in_ch = static_cast<int32_t>(options.layer.image_channel_num.data.i_ch_num + 1);
runtime_shape_t in_shape { options.batches, in_ch, in_h, in_w };
auto in_fmap_size = kernels::details::compute_size(in_shape);
auto out_h = static_cast<int32_t>(options.layer.image_size.data.o_col_high + 1);
auto out_w = static_cast<int32_t>(options.layer.image_size.data.o_row_wid + 1);
auto out_ch = static_cast<int32_t>(options.layer.image_channel_num.data.o_ch_num + 1);
runtime_shape_t conv_out_shape { options.batches, out_ch, in_h, in_w };
auto conv_out_fmap_size = kernels::details::compute_size(conv_out_shape);
runtime_shape_t out_shape { options.batches, out_ch, out_h, out_w };
auto out_fmap_size = kernels::details::compute_size(out_shape);
auto input_tmp = std::make_unique<uint8_t[]>(in_fmap_size);
auto workspace = std::make_unique<int64_t[]>(conv_out_fmap_size);
auto conv_output_tmp = std::make_unique<uint8_t[]>(conv_out_fmap_size);
auto output_tmp = std::make_unique<uint8_t[]>(out_fmap_size);
kernels::k210::kpu_download(input.data(), input_tmp.get(), in_shape);
auto is_depthwise = options.layer.interrupt_enabe.data.depth_wise_layer != 0;
auto filter_size = get_kpu_filter_size((kpu_filter_type_t)options.layer.kernel_pool_type_cfg.data.kernel_type);
auto pad_value = (uint8_t)options.layer.kernel_pool_type_cfg.data.pad_value;
auto arg_x = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_x);
auto shift_x = (int32_t)options.layer.conv_value.data.shr_x;
auto arg_w = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_w);
auto shift_w = (int32_t)options.layer.conv_value.data.shr_w;
auto arg_add = kernels::details::to_signed<40>(options.layer.conv_value2.data.arg_add);
auto batchnorm = std::make_unique<kpu_batchnorm_segment[]>(out_ch);
for (size_t i = 0; i < out_ch; i++)
{
auto &src = options.batch_norm[i].batchnorm.data;
auto &dest = batchnorm[i];
dest.mul = (int32_t)kernels::details::to_signed<24>(src.norm_mul);
dest.shift = (int32_t)src.norm_shift;
dest.add = (int32_t)kernels::details::to_signed<32>(src.norm_add);
}
kpu_activation_table_t activation;
for (size_t i = 0; i < 16; i++)
{
auto &src = options.activation->activate_para[i].data;
auto &dest = activation[i];
dest.start_x = kernels::details::to_signed<36>(src.x_start);
dest.mul = (int32_t)kernels::details::to_signed<16>(src.y_mul);
dest.shift = (int32_t)src.shift_number;
if (i < 16)
dest.add = options.activation->activate_para_bias0.data.result_bias[i];
else
dest.add = options.activation->activate_para_bias1.data.result_bias[i - 16];
}
#define KPU_CONV2D_IMPL(is_depthwise_val, filter_size_val) \
if (is_depthwise == is_depthwise_val && filter_size == filter_size_val) \
kernels::k210::kpu_conv2d<is_depthwise_val, filter_size_val>(input_tmp.get(), workspace.get(), conv_output_tmp.get(), options.weights.data(), \
in_h, in_w, in_ch, out_ch, pad_value, arg_x, shift_x, arg_w, shift_w, arg_add, batchnorm.get(), activation)
KPU_CONV2D_IMPL(true, 1);
else KPU_CONV2D_IMPL(true, 3);
else KPU_CONV2D_IMPL(false, 1);
else KPU_CONV2D_IMPL(false, 3);
kernels::k210::kpu_pool2d(conv_output_tmp.get(), output_tmp.get(), in_h, in_w, out_ch, (kpu_pool_type_t)options.layer.kernel_pool_type_cfg.data.pool_type);
kernels::k210::kpu_upload(output_tmp.get(), kpu_out.data(), out_shape);
if (options.main_mem_output.size)
{
auto main_output = interpreter.memory_at<uint8_t>(options.main_mem_output);
std::copy(output_tmp.get(), output_tmp.get() + out_fmap_size, main_output.data());
}
return kcr_done;
#else
auto &ctx = interpreter.context();
ctx.interpreter = &interpreter;
ctx.step = step;
if (options.main_mem_output.size)
{
auto main_output = interpreter.memory_at<uint8_t>(options.main_mem_output);
kpu_conv2d_output(options.layer, interpreter.dma_ch(), main_output.data(), kpu_plic_thunk, &ctx);
}
else
{
kpu_conv2d_normal(options.layer, kpu_plic_thunk, &ctx);
}
return kcr_async;
#endif
}
}
}
}


@ -0,0 +1,238 @@
#include <kernels/neutral/neutral_kernels.h>
#include <runtime/kernel_registry.h>
#include <targets/neutral/neutral_ops_body.h>
using namespace nncase;
using namespace nncase::runtime;
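// ELEM_SIZE_IMPL dispatches a type-agnostic kernel on element width alone
// (1/2/4 bytes); that is sufficient for the pad/resize-nn/transpose/slice
// style ops below, which only move elements and never interpret their values.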
#define ELEM_SIZE_IMPL(type, KERNEL) \
switch (runtime::get_bytes(type)) \
{ \
case 1: \
KERNEL(uint8_t); \
break; \
case 2: \
KERNEL(uint16_t); \
break; \
case 4: \
KERNEL(uint32_t); \
break; \
default: \
return kcr_error; \
}
namespace nncase
{
namespace targets
{
namespace neutral
{
kernel_call_result binary(binary_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input_a = interpreter.memory_at<float>(options.input_a);
auto input_b = interpreter.memory_at<float>(options.input_b);
auto output = interpreter.memory_at<float>(options.output);
auto binary = [&](auto op) {
kernels::neutral::binary(input_a.data(), input_b.data(), output.data(), options.in_a_shape, options.in_b_shape, options.out_shape, options.fused_activation, op);
};
switch (options.binary_op)
{
case binary_add:
binary([](auto a, auto b) { return a + b; });
return kcr_done;
case binary_sub:
binary([](auto a, auto b) { return a - b; });
return kcr_done;
case binary_mul:
binary([](auto a, auto b) { return a * b; });
return kcr_done;
case binary_div:
binary([](auto a, auto b) { return a / b; });
return kcr_done;
default:
return kcr_error;
}
}
kernel_call_result concat(concat_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::neutral::concat(options.inputs, output.data(), options.dims, options.inner_size, options.outer_size,
[&](const memory_range &range) { return interpreter.memory_at<uint8_t>(range).data(); });
return kcr_done;
}
kernel_call_result conv2d(conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.groups, options.out_channels, options.filter_h,
options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation);
return kcr_done;
}
kernel_call_result dequantize(dequantize_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::dequantize(input.data(), output.data(), input.size(), options.quant_param);
return kcr_done;
}
kernel_call_result matmul(matmul_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input_a = interpreter.memory_at<float>(options.input_a);
auto input_b = interpreter.memory_at<float>(options.input_b);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::matmul(input_a.data(), input_b.data(), output.data(), options.bias.data(), options.a_rows, options.a_cols, options.b_cols, options.fused_activation);
return kcr_done;
}
kernel_call_result memory_copy(memory_copy_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
std::copy(input.begin(), input.end(), output.begin());
return kcr_done;
}
kernel_call_result pad(pad_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
#define PAD_KERNEL(T) \
kernels::neutral::pad(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.paddings, options.pad_value.as<T>());
ELEM_SIZE_IMPL(options.input.datatype, PAD_KERNEL);
return kcr_done;
#undef PAD_KERNEL
}
kernel_call_result quantize(quantize_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
kernels::neutral::quantize(input.data(), output.data(), input.size(), options.quant_param);
return runtime::kcr_done;
}
kernel_call_result reduce(reduce_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
auto reduce = [&](auto op) {
kernels::neutral::reduce(input.data(), output.data(), options.init_value, options.in_shape, options.out_shape, op);
};
switch (options.reduce_op)
{
case reduce_mean:
{
reduce([](auto a, auto b) { return a + b; });
auto mul = (float)output.size() / input.size();
kernels::neutral::unary(output.data(), output.data(), output.size(), [mul](auto a) { return a * mul; });
return kcr_done;
}
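// (mean is computed as a full sum followed by a uniform scale: the factor
// output.size() / input.size() equals 1/k when each output aggregates k inputs.)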
case reduce_min:
reduce([](auto a, auto b) { return std::min(a, b); });
return kcr_done;
case reduce_max:
reduce([](auto a, auto b) { return std::max(a, b); });
return kcr_done;
default:
return kcr_error;
}
}
kernel_call_result reduce_window2d(reduce_window2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
auto reduce = [&](auto binary_op, auto window_op) {
kernels::neutral::reduce_window2d(input.data(), output.data(), options.init_value, options.in_shape, options.filter_h, options.filter_w, options.stride_h,
options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation, binary_op, window_op);
};
switch (options.reduce_op)
{
case reduce_mean:
reduce([](auto a, auto b) { return a + b; }, [](auto v, auto k) { return v / k; });
return kcr_done;
case reduce_min:
reduce([](auto a, auto b) { return std::min(a, b); }, [](auto v, auto k) { return v; });
return kcr_done;
case reduce_max:
reduce([](auto a, auto b) { return std::max(a, b); }, [](auto v, auto k) { return v; });
return kcr_done;
default:
return kcr_error;
}
}
kernel_call_result resize_bilinear(resize_bilinear_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::resize_bilinear(input.data(), output.data(), options.in_shape, options.out_h, options.out_w, options.align_corners);
return kcr_done;
}
kernel_call_result resize_nearest_neighbor(resize_nearest_neighbor_options &options, interpreter_t &interpreter, runtime::interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
#define RESIZE_NN_KERNEL(T) \
kernels::neutral::resize_nearest_neighbor(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.out_h, options.out_w);
ELEM_SIZE_IMPL(options.input.datatype, RESIZE_NN_KERNEL);
return kcr_done;
#undef RESIZE_NN_KERNEL
}
kernel_call_result softmax(softmax_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<float>(options.input);
auto output = interpreter.memory_at<float>(options.output);
kernels::neutral::softmax(input.data(), output.data(), options.beta, options.outer_size, options.inner_size);
return kcr_done;
}
kernel_call_result transpose(transpose_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
#define TRANSPOSE_KERNEL(T) \
kernels::neutral::transpose(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.perm);
ELEM_SIZE_IMPL(options.input.datatype, TRANSPOSE_KERNEL);
return kcr_done;
#undef TRANSPOSE_KERNEL
}
kernel_call_result strided_slice(strided_slice_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
auto input = interpreter.memory_at<uint8_t>(options.input);
auto output = interpreter.memory_at<uint8_t>(options.output);
#define STRIDED_SLICE_KERNEL(T) \
kernels::neutral::strided_slice(reinterpret_cast<const T *>(input.data()), reinterpret_cast<T *>(output.data()), options.in_shape, options.begin, options.end, options.strides);
ELEM_SIZE_IMPL(options.input.datatype, STRIDED_SLICE_KERNEL);
return kcr_done;
#undef STRIDED_SLICE_KERNEL
}
}
}
}

29
third_party/xtl/LICENSE vendored Normal file

@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2017, Sylvain Corlay and Johan Mabille
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

66
third_party/xtl/README.md vendored Normal file

@ -0,0 +1,66 @@
# ![xtl](docs/source/xtl.svg)
[![Travis](https://travis-ci.org/QuantStack/xtl.svg?branch=master)](https://travis-ci.org/QuantStack/xtl)
[![Appveyor](https://ci.appveyor.com/api/projects/status/g9bldap2wirlue9w?svg=true)](https://ci.appveyor.com/project/QuantStack/xtl)
[![Azure](https://dev.azure.com/johanmabille/johanmabille/_apis/build/status/QuantStack.xtl?branchName=master)](https://dev.azure.com/johanmabille/johanmabille/_build/latest?definitionId=1&branchName=master)
[![Documentation Status](http://readthedocs.org/projects/xtl/badge/?version=latest)](https://xtl.readthedocs.io/en/latest/?badge=latest)
[![Join the Gitter Chat](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/QuantStack/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
Basic tools (containers, algorithms) used by other quantstack packages
## Installation
`xtl` is a header-only library. We provide a package for the conda package manager.
```bash
conda install -c conda-forge xtl
```
Or you can directly install it from the sources:
```bash
cmake -DCMAKE_INSTALL_PREFIX=your_install_prefix
make install
```
## Documentation
To get started with using `xtl`, check out the full documentation
http://xtl.readthedocs.io/
## Building the HTML documentation
xtl's documentation is built with three tools
- [doxygen](http://www.doxygen.org)
- [sphinx](http://www.sphinx-doc.org)
- [breathe](https://breathe.readthedocs.io)
While doxygen must be installed separately, you can install breathe by typing
```bash
pip install breathe
```
Breathe can also be installed with `conda`
```bash
conda install -c conda-forge breathe
```
Finally, build the documentation with
```bash
make html
```
from the `docs` subdirectory.
## License
We use a shared copyright model that enables all contributors to maintain the
copyright on their contributions.
This software is licensed under the BSD-3-Clause license. See the [LICENSE](LICENSE) file for details.

20
third_party/xtl/include/xtl/xspan.hpp vendored Normal file

@ -0,0 +1,20 @@
/***************************************************************************
* Copyright (c) 2016, Sylvain Corlay and Johan Mabille *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XTL_XSPAN_HPP
#define XTL_XSPAN_HPP
#include "xspan_impl.hpp"
namespace xtl
{
using tcb::span;
constexpr std::ptrdiff_t dynamic_extent = tcb::dynamic_extent;
}
#endif


@ -0,0 +1,778 @@
// https://github.com/tcbrindle/span/blob/master/include/tcb/span.hpp
// TCB SPAN @commit cd0c6d0
/*
This is an implementation of std::span from P0122R7
http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0122r7.pdf
*/
// Copyright Tristan Brindle 2018.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file ../../LICENSE_1_0.txt or copy at
// https://www.boost.org/LICENSE_1_0.txt)
#ifndef TCB_SPAN_HPP_INCLUDED
#define TCB_SPAN_HPP_INCLUDED
#include <array>
#include <cstddef>
#include <type_traits>
#ifndef TCB_SPAN_NO_EXCEPTIONS
// Attempt to discover whether we're being compiled with exception support
#if !(defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND))
#define TCB_SPAN_NO_EXCEPTIONS
#endif
#endif
#ifndef TCB_SPAN_NO_EXCEPTIONS
#include <cstdio>
#include <stdexcept>
#endif
// Various feature test macros
#ifndef TCB_SPAN_NAMESPACE_NAME
#define TCB_SPAN_NAMESPACE_NAME tcb
#endif
#ifdef TCB_SPAN_STD_COMPLIANT_MODE
#define TCB_SPAN_NO_DEPRECATION_WARNINGS
#endif
#ifndef TCB_SPAN_NO_DEPRECATION_WARNINGS
#define TCB_SPAN_DEPRECATED_FOR(msg) [[deprecated(msg)]]
#else
#define TCB_SPAN_DEPRECATED_FOR(msg)
#endif
#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
#define TCB_SPAN_HAVE_CPP17
#endif
#if __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
#define TCB_SPAN_HAVE_CPP14
#endif
namespace TCB_SPAN_NAMESPACE_NAME {
// Establish default contract checking behavior
#if !defined(TCB_SPAN_THROW_ON_CONTRACT_VIOLATION) && \
!defined(TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION) && \
!defined(TCB_SPAN_NO_CONTRACT_CHECKING)
#if defined(NDEBUG) || !defined(TCB_SPAN_HAVE_CPP14)
#define TCB_SPAN_NO_CONTRACT_CHECKING
#else
#define TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION
#endif
#endif
#if defined(TCB_SPAN_THROW_ON_CONTRACT_VIOLATION)
struct contract_violation_error : std::logic_error {
explicit contract_violation_error(const char* msg) : std::logic_error(msg)
{}
};
inline void contract_violation(const char* msg)
{
throw contract_violation_error(msg);
}
#elif defined(TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION)
[[noreturn]] inline void contract_violation(const char* /*unused*/)
{
std::terminate();
}
#endif
#if !defined(TCB_SPAN_NO_CONTRACT_CHECKING)
#define TCB_SPAN_STRINGIFY(cond) #cond
#define TCB_SPAN_EXPECT(cond) \
cond ? (void) 0 : contract_violation("Expected " TCB_SPAN_STRINGIFY(cond))
#else
#define TCB_SPAN_EXPECT(cond)
#endif
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_inline_variables)
#define TCB_SPAN_INLINE_VAR inline
#else
#define TCB_SPAN_INLINE_VAR
#endif
#if defined(TCB_SPAN_HAVE_CPP14) || \
(defined(__cpp_constexpr) && __cpp_constexpr >= 201304)
#define TCB_SPAN_CONSTEXPR14 constexpr
#else
#define TCB_SPAN_CONSTEXPR14
#endif
#if defined(TCB_SPAN_NO_CONTRACT_CHECKING)
#define TCB_SPAN_CONSTEXPR11 constexpr
#else
#define TCB_SPAN_CONSTEXPR11 TCB_SPAN_CONSTEXPR14
#endif
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_deduction_guides)
#define TCB_SPAN_HAVE_DEDUCTION_GUIDES
#endif
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_byte)
#define TCB_SPAN_HAVE_STD_BYTE
#endif
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_array_constexpr)
#define TCB_SPAN_HAVE_CONSTEXPR_STD_ARRAY_ETC
#endif
#if defined(TCB_SPAN_HAVE_CONSTEXPR_STD_ARRAY_ETC)
#define TCB_SPAN_ARRAY_CONSTEXPR constexpr
#else
#define TCB_SPAN_ARRAY_CONSTEXPR
#endif
#ifdef TCB_SPAN_HAVE_STD_BYTE
using byte = std::byte;
#else
using byte = unsigned char;
#endif
TCB_SPAN_INLINE_VAR constexpr std::ptrdiff_t dynamic_extent = -1;
template <typename ElementType, std::ptrdiff_t Extent = dynamic_extent>
class span;
namespace detail {
template <typename E, std::ptrdiff_t S>
struct span_storage {
constexpr span_storage() noexcept = default;
constexpr span_storage(E* ptr, std::ptrdiff_t /*unused*/) noexcept
: ptr(ptr)
{}
E* ptr = nullptr;
static constexpr std::ptrdiff_t size = S;
};
template <typename E>
struct span_storage<E, dynamic_extent> {
constexpr span_storage() noexcept = default;
constexpr span_storage(E* ptr, std::size_t size) noexcept
: ptr(ptr), size(size)
{}
E* ptr = nullptr;
std::size_t size = 0;
};
// Reimplementation of C++17 std::size() and std::data()
#if defined(TCB_SPAN_HAVE_CPP17) || \
defined(__cpp_lib_nonmember_container_access)
using std::data;
using std::size;
#else
template <class C>
constexpr auto size(const C& c) -> decltype(c.size())
{
return c.size();
}
template <class T, std::size_t N>
constexpr std::size_t size(const T (&)[N]) noexcept
{
return N;
}
template <class C>
constexpr auto data(C& c) -> decltype(c.data())
{
return c.data();
}
template <class C>
constexpr auto data(const C& c) -> decltype(c.data())
{
return c.data();
}
template <class T, std::size_t N>
constexpr T* data(T (&array)[N]) noexcept
{
return array;
}
template <class E>
constexpr const E* data(std::initializer_list<E> il) noexcept
{
return il.begin();
}
#endif // TCB_SPAN_HAVE_CPP17
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_void_t)
using std::void_t;
#else
template <typename...>
using void_t = void;
#endif
template <typename T>
using uncvref_t =
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
template <typename>
struct is_span : std::false_type {};
template <typename T, std::ptrdiff_t S>
struct is_span<span<T, S>> : std::true_type {};
template <typename>
struct is_std_array : std::false_type {};
template <typename T, std::size_t N>
struct is_std_array<std::array<T, N>> : std::true_type {};
template <typename, typename = void>
struct has_size_and_data : std::false_type {};
template <typename T>
struct has_size_and_data<T, void_t<decltype(detail::size(std::declval<T>())),
decltype(detail::data(std::declval<T>()))>>
: std::true_type {};
template <typename C, typename U = uncvref_t<C>>
struct is_container {
static constexpr bool value =
!is_span<U>::value && !is_std_array<U>::value &&
!std::is_array<U>::value && has_size_and_data<C>::value;
};
template <typename T>
using remove_pointer_t = typename std::remove_pointer<T>::type;
template <typename, typename, typename = void>
struct is_container_element_type_compatible : std::false_type {};
template <typename T, typename E>
struct is_container_element_type_compatible<
T, E, void_t<decltype(detail::data(std::declval<T>()))>>
: std::is_convertible<
remove_pointer_t<decltype(detail::data(std::declval<T>()))> (*)[],
E (*)[]> {};
template <typename, typename = size_t>
struct is_complete : std::false_type {};
template <typename T>
struct is_complete<T, decltype(sizeof(T))> : std::true_type {};
} // namespace detail
template <typename ElementType, std::ptrdiff_t Extent>
class span {
static_assert(Extent == dynamic_extent || Extent >= 0,
"A span must have an extent greater than or equal to zero, "
"or a dynamic extent");
static_assert(std::is_object<ElementType>::value,
"A span's ElementType must be an object type (not a "
"reference type or void)");
static_assert(detail::is_complete<ElementType>::value,
"A span's ElementType must be a complete type (not a forward "
"declaration)");
static_assert(!std::is_abstract<ElementType>::value,
"A span's ElementType cannot be an abstract class type");
using storage_type = detail::span_storage<ElementType, Extent>;
public:
// constants and types
using element_type = ElementType;
using value_type = typename std::remove_cv<ElementType>::type;
using index_type = std::size_t;
using difference_type = std::ptrdiff_t;
using pointer = ElementType*;
using reference = ElementType&;
using iterator = pointer;
using const_iterator = const ElementType*;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
static constexpr index_type extent = static_cast<index_type>(Extent);
// [span.cons], span constructors, copy, assignment, and destructor
template <std::ptrdiff_t E = Extent,
typename std::enable_if<E <= 0, int>::type = 0>
constexpr span() noexcept
{}
TCB_SPAN_CONSTEXPR11 span(pointer ptr, index_type count)
: storage_(ptr, count)
{
TCB_SPAN_EXPECT(extent == dynamic_extent || count == extent);
}
TCB_SPAN_CONSTEXPR11 span(pointer first_elem, pointer last_elem)
: storage_(first_elem, last_elem - first_elem)
{
TCB_SPAN_EXPECT(extent == dynamic_extent ||
last_elem - first_elem == extent);
}
template <
std::size_t N, std::ptrdiff_t E = Extent,
typename std::enable_if<
(E == dynamic_extent || static_cast<std::ptrdiff_t>(N) == E) &&
detail::is_container_element_type_compatible<
element_type (&)[N], ElementType>::value,
int>::type = 0>
constexpr span(element_type (&arr)[N]) noexcept : storage_(arr, N)
{}
template <
std::size_t N, std::ptrdiff_t E = Extent,
typename std::enable_if<
(E == dynamic_extent || static_cast<std::ptrdiff_t>(N) == E) &&
detail::is_container_element_type_compatible<
std::array<value_type, N>&, ElementType>::value,
int>::type = 0>
TCB_SPAN_ARRAY_CONSTEXPR span(std::array<value_type, N>& arr) noexcept
: storage_(arr.data(), N)
{}
template <
std::size_t N, std::ptrdiff_t E = Extent,
typename std::enable_if<
(E == dynamic_extent || static_cast<std::ptrdiff_t>(N) == E) &&
detail::is_container_element_type_compatible<
const std::array<value_type, N>&, ElementType>::value,
int>::type = 0>
TCB_SPAN_ARRAY_CONSTEXPR span(const std::array<value_type, N>& arr) noexcept
: storage_(arr.data(), N)
{}
template <typename Container,
typename std::enable_if<
detail::is_container<Container>::value &&
detail::is_container_element_type_compatible<
Container&, ElementType>::value,
int>::type = 0>
TCB_SPAN_CONSTEXPR11 span(Container& cont)
: storage_(detail::data(cont), detail::size(cont))
{
TCB_SPAN_EXPECT(extent == dynamic_extent ||
static_cast<std::ptrdiff_t>(detail::size(cont)) ==
extent);
}
template <typename Container,
typename std::enable_if<
detail::is_container<Container>::value &&
detail::is_container_element_type_compatible<
const Container&, ElementType>::value,
int>::type = 0>
TCB_SPAN_CONSTEXPR11 span(const Container& cont)
: storage_(detail::data(cont), detail::size(cont))
{
TCB_SPAN_EXPECT(extent == dynamic_extent ||
static_cast<std::ptrdiff_t>(detail::size(cont)) ==
extent);
}
constexpr span(const span& other) noexcept = default;
template <typename OtherElementType, std::ptrdiff_t OtherExtent,
typename std::enable_if<
(Extent == OtherExtent || Extent == dynamic_extent) &&
std::is_convertible<OtherElementType (*)[],
ElementType (*)[]>::value,
int>::type = 0>
constexpr span(const span<OtherElementType, OtherExtent>& other) noexcept
: storage_(other.data(), other.size())
{}
~span() noexcept = default;
span& operator=(const span& other) noexcept = default;
// [span.sub], span subviews
template <std::ptrdiff_t Count>
TCB_SPAN_CONSTEXPR11 span<element_type, Count> first() const
{
TCB_SPAN_EXPECT(Count >= 0 && Count <= size());
return {data(), Count};
}
template <std::ptrdiff_t Count>
TCB_SPAN_CONSTEXPR11 span<element_type, Count> last() const
{
TCB_SPAN_EXPECT(Count >= 0 && Count <= size());
return {data() + (size() - Count), Count};
}
template <std::ptrdiff_t Offset, std::ptrdiff_t Count = dynamic_extent>
using subspan_return_t =
span<ElementType, Count != dynamic_extent
? Count
: (Extent != dynamic_extent ? Extent - Offset
: dynamic_extent)>;
template <std::ptrdiff_t Offset, std::ptrdiff_t Count = dynamic_extent>
TCB_SPAN_CONSTEXPR11 subspan_return_t<Offset, Count> subspan() const
{
TCB_SPAN_EXPECT((Offset >= 0 && Offset <= size()) &&
(Count == dynamic_extent ||
(Count >= 0 && Offset + Count <= size())));
return {data() + Offset,
Count != dynamic_extent
? Count
: (Extent != dynamic_extent ? Extent - Offset
: size() - Offset)};
}
TCB_SPAN_CONSTEXPR11 span<element_type, dynamic_extent>
first(index_type count) const
{
TCB_SPAN_EXPECT(count >= 0 && count <= size());
return {data(), count};
}
TCB_SPAN_CONSTEXPR11 span<element_type, dynamic_extent>
last(index_type count) const
{
TCB_SPAN_EXPECT(count >= 0 && count <= size());
return {data() + (size() - count), count};
}
TCB_SPAN_CONSTEXPR11 span<element_type, dynamic_extent>
subspan(index_type offset, index_type count = static_cast<index_type>(dynamic_extent)) const
{
TCB_SPAN_EXPECT((offset >= 0 && offset <= size()) &&
(count == dynamic_extent ||
(count >= 0 && offset + count <= size())));
return {data() + offset,
count == dynamic_extent ? size() - offset : count};
}
// [span.obs], span observers
constexpr index_type size() const noexcept { return storage_.size; }
constexpr index_type size_bytes() const noexcept
{
return size() * sizeof(element_type);
}
constexpr bool empty() const noexcept { return size() == 0; }
// [span.elem], span element access
TCB_SPAN_CONSTEXPR11 reference operator[](index_type idx) const
{
TCB_SPAN_EXPECT(idx >= 0 && idx < size());
return *(data() + idx);
}
/* Extension: not in P0122 */
#ifndef TCB_SPAN_STD_COMPLIANT_MODE
TCB_SPAN_CONSTEXPR14 reference at(index_type idx) const
{
#ifndef TCB_SPAN_NO_EXCEPTIONS
if (idx < 0 || idx >= size()) {
char msgbuf[64] = {
0,
};
std::snprintf(msgbuf, sizeof(msgbuf),
"Index %td is out of range for span of size %td", idx,
size());
throw std::out_of_range{msgbuf};
}
#endif // TCB_SPAN_NO_EXCEPTIONS
return this->operator[](idx);
}
TCB_SPAN_CONSTEXPR11 reference front() const
{
TCB_SPAN_EXPECT(!empty());
return *data();
}
TCB_SPAN_CONSTEXPR11 reference back() const
{
TCB_SPAN_EXPECT(!empty());
return *(data() + (size() - 1));
}
#endif // TCB_SPAN_STD_COMPLIANT_MODE
#ifndef TCB_SPAN_NO_FUNCTION_CALL_OPERATOR
TCB_SPAN_DEPRECATED_FOR("Use operator[] instead")
constexpr reference operator()(index_type idx) const
{
return this->operator[](idx);
}
#endif // TCB_SPAN_NO_FUNCTION_CALL_OPERATOR
constexpr pointer data() const noexcept { return storage_.ptr; }
// [span.iterators], span iterator support
constexpr iterator begin() const noexcept { return data(); }
constexpr iterator end() const noexcept { return data() + size(); }
constexpr const_iterator cbegin() const noexcept { return begin(); }
constexpr const_iterator cend() const noexcept { return end(); }
TCB_SPAN_ARRAY_CONSTEXPR reverse_iterator rbegin() const noexcept
{
return reverse_iterator(end());
}
TCB_SPAN_ARRAY_CONSTEXPR reverse_iterator rend() const noexcept
{
return reverse_iterator(begin());
}
TCB_SPAN_ARRAY_CONSTEXPR const_reverse_iterator crbegin() const noexcept
{
return const_reverse_iterator(cend());
}
TCB_SPAN_ARRAY_CONSTEXPR const_reverse_iterator crend() const noexcept
{
return const_reverse_iterator(cbegin());
}
private:
storage_type storage_{};
};
#ifdef TCB_SPAN_HAVE_DEDUCTION_GUIDES
/* Deduction Guides */
template <class T, size_t N>
span(T (&)[N])->span<T, N>;
template <class T, size_t N>
span(std::array<T, N>&)->span<T, N>;
template <class T, size_t N>
span(const std::array<T, N>&)->span<const T, N>;
template <class Container>
span(Container&)->span<typename Container::value_type>;
template <class Container>
span(const Container&)->span<const typename Container::value_type>;
#endif // TCB_HAVE_DEDUCTION_GUIDES
template <typename ElementType, std::ptrdiff_t Extent>
constexpr span<ElementType, Extent>
make_span(span<ElementType, Extent> s) noexcept
{
return s;
}
#define AS_SIGNED(N) static_cast<std::ptrdiff_t>(N)
template <typename T, std::size_t N>
constexpr span<T, AS_SIGNED(N)> make_span(T (&arr)[N]) noexcept
{
return {arr};
}
template <typename T, std::size_t N>
TCB_SPAN_ARRAY_CONSTEXPR span<T, AS_SIGNED(N)> make_span(std::array<T, N>& arr) noexcept
{
return {arr};
}
template <typename T, std::size_t N>
TCB_SPAN_ARRAY_CONSTEXPR span<const T, AS_SIGNED(N)>
make_span(const std::array<T, N>& arr) noexcept
{
return {arr};
}
#undef AS_SIGNED
template <typename Container>
constexpr span<typename Container::value_type> make_span(Container& cont)
{
return {cont};
}
template <typename Container>
constexpr span<const typename Container::value_type>
make_span(const Container& cont)
{
return {cont};
}
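// Usage sketch (illustrative only): make_span() provides the same type
// deduction for C++11/14 callers that cannot use deduction guides:
//
//     std::array<float, 2> a{};
//     auto s = tcb::make_span(a);   // span<float, 2>
//     std::vector<char> v;
//     auto d = tcb::make_span(v);   // span<char> (dynamic extent)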
/* Comparison operators */
// Implementation note: the implementations of == and < are equivalent to
// the four-iterator ("4-legged") overloads of std::equal and
// std::lexicographical_compare, respectively
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator==(span<T, X> lhs, span<U, Y> rhs)
{
if (lhs.size() != rhs.size()) {
return false;
}
for (std::ptrdiff_t i = 0; i < lhs.size(); i++) {
if (lhs[i] != rhs[i]) {
return false;
}
}
return true;
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator!=(span<T, X> lhs, span<U, Y> rhs)
{
return !(lhs == rhs);
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator<(span<T, X> lhs, span<U, Y> rhs)
{
// No std::min to avoid dragging in <algorithm>
const std::ptrdiff_t size =
lhs.size() < rhs.size() ? lhs.size() : rhs.size();
for (std::ptrdiff_t i = 0; i < size; i++) {
if (lhs[i] < rhs[i]) {
return true;
}
if (lhs[i] > rhs[i]) {
return false;
}
}
return lhs.size() < rhs.size();
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator<=(span<T, X> lhs, span<U, Y> rhs)
{
return !(rhs < lhs);
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator>(span<T, X> lhs, span<U, Y> rhs)
{
return rhs < lhs;
}
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
TCB_SPAN_CONSTEXPR14 bool operator>=(span<T, X> lhs, span<U, Y> rhs)
{
return !(lhs < rhs);
}
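// Worked example of the ordering defined above (illustrative only):
// {1, 2} < {1, 3} because the first mismatching elements compare 2 < 3,
// and {1, 2} < {1, 2, 3} because a proper prefix orders before the longer
// span (the final size comparison in operator<).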
template <typename ElementType, std::ptrdiff_t Extent>
span<const byte, ((Extent == dynamic_extent)
                      ? dynamic_extent
                      : (static_cast<std::ptrdiff_t>(sizeof(ElementType)) * Extent))>
as_bytes(span<ElementType, Extent> s) noexcept
{
    return {reinterpret_cast<const byte*>(s.data()), s.size_bytes()};
}
template <
    class ElementType, std::ptrdiff_t Extent,
    typename std::enable_if<!std::is_const<ElementType>::value, int>::type = 0>
span<byte, ((Extent == dynamic_extent)
                ? dynamic_extent
                : (static_cast<std::ptrdiff_t>(sizeof(ElementType)) * Extent))>
as_writable_bytes(span<ElementType, Extent> s) noexcept
{
    return {reinterpret_cast<byte*>(s.data()), s.size_bytes()};
}
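// Usage sketch (illustrative only): viewing the object representation as
// bytes; for fixed-extent spans the byte extent scales by
// sizeof(ElementType). Here `byte` is std::byte under C++17 or the
// library's fallback type otherwise:
//
//     std::uint32_t words[2] = {0, 0};
//     auto ro = tcb::as_bytes(tcb::make_span(words));          // span<const byte, 8>
//     auto rw = tcb::as_writable_bytes(tcb::make_span(words)); // span<byte, 8>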
/* Extension: nonmember subview operations */
#ifndef TCB_SPAN_STD_COMPLIANT_MODE
template <std::ptrdiff_t Count, typename T>
TCB_SPAN_CONSTEXPR11 auto first(T& t)
-> decltype(make_span(t).template first<Count>())
{
return make_span(t).template first<Count>();
}
template <std::ptrdiff_t Count, typename T>
TCB_SPAN_CONSTEXPR11 auto last(T& t)
-> decltype(make_span(t).template last<Count>())
{
return make_span(t).template last<Count>();
}
template <std::ptrdiff_t Offset, std::ptrdiff_t Count = dynamic_extent,
typename T>
TCB_SPAN_CONSTEXPR11 auto subspan(T& t)
-> decltype(make_span(t).template subspan<Offset, Count>())
{
return make_span(t).template subspan<Offset, Count>();
}
template <typename T>
TCB_SPAN_CONSTEXPR11 auto first(T& t, std::ptrdiff_t count)
-> decltype(make_span(t).first(count))
{
return make_span(t).first(count);
}
template <typename T>
TCB_SPAN_CONSTEXPR11 auto last(T& t, std::ptrdiff_t count)
-> decltype(make_span(t).last(count))
{
return make_span(t).last(count);
}
template <typename T>
TCB_SPAN_CONSTEXPR11 auto subspan(T& t, std::ptrdiff_t offset,
std::ptrdiff_t count = dynamic_extent)
-> decltype(make_span(t).subspan(offset, count))
{
return make_span(t).subspan(offset, count);
}
#endif // TCB_SPAN_STD_COMPLIANT_MODE
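// Usage sketch (illustrative only): the nonmember helpers apply to anything
// make_span() accepts, without constructing a span first:
//
//     int raw[4] = {1, 2, 3, 4};
//     auto head = tcb::first<2>(raw);       // span<int, 2> over {1, 2}
//     auto tail = tcb::last(raw, 2);        // span<int> over {3, 4}
//     auto mid  = tcb::subspan(raw, 1, 2);  // span<int> over {2, 3}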
} // namespace TCB_SPAN_NAMESPACE_NAME
/* Extension: support for C++17 structured bindings */
#ifndef TCB_SPAN_STD_COMPLIANT_MODE
namespace TCB_SPAN_NAMESPACE_NAME {
template <std::ptrdiff_t N, typename E, std::ptrdiff_t S>
constexpr auto get(span<E, S> s) -> decltype(s[N])
{
return s[N];
}
} // namespace TCB_SPAN_NAMESPACE_NAME
namespace std {
template <typename E, ptrdiff_t S>
class tuple_size<tcb::span<E, S>> : public integral_constant<size_t, static_cast<size_t>(S)> {};
template <typename E>
class tuple_size<tcb::span<E, tcb::dynamic_extent>>; // not defined
template <size_t N, typename E, ptrdiff_t S>
class tuple_element<N, tcb::span<E, S>> {
public:
using type = E;
};
} // end namespace std
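// Usage sketch (illustrative only): the tuple_size/tuple_element
// specializations plus the ADL-found get() above enable structured bindings
// for fixed-extent spans (dynamic-extent spans are deliberately excluded):
//
//     int xy[2] = {10, 20};
//     auto [x, y] = tcb::span<int, 2>{xy};   // x == 10, y == 20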
#endif // TCB_SPAN_STD_COMPLIANT_MODE
#endif // TCB_SPAN_HPP_INCLUDED