Add nncase v1.0 runtime

pull/125/head
sunnycase 2021-06-09 15:43:37 +08:00
parent 06a2ea71f2
commit d740f55894
118 changed files with 45154 additions and 165 deletions

View File

@ -21,6 +21,10 @@ include(./cmake/macros.internal.cmake)
# Make SDK library and project headers visible (directory-scoped helper macro
# from macros.internal.cmake).
header_directories(${SDK_ROOT}/lib)
header_directories(src/${PROJ})
header_directories(kendryte-standalone-demo/${PROJ})
# Header-only third-party dependencies required by the nncase v1 runtime.
add_subdirectory(third_party/gsl-lite)
add_subdirectory(third_party/mpark-variant)
add_subdirectory(third_party/nlohmann_json)
# build library first
add_subdirectory(lib)

View File

@ -113,7 +113,7 @@ SECTIONS
{
PROVIDE_HIDDEN (__init_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
PROVIDE_HIDDEN (__init_array_end = .);
} >ram AT>ram :ram_ro

View File

@ -42,5 +42,5 @@ SET_SOURCE_FILES_PROPERTIES(${ASSEMBLY_FILES} PROPERTIES COMPILE_FLAGS "-x assem
ADD_LIBRARY(kendryte
${LIB_SRC}
)
TARGET_LINK_LIBRARIES(kendryte PUBLIC nncase)
TARGET_LINK_LIBRARIES(kendryte PUBLIC nncase-wrapper)
SET_TARGET_PROPERTIES(kendryte PROPERTIES LINKER_LANGUAGE C)

View File

@ -574,6 +574,7 @@ handle_breakpoint(uintptr_t cause, uintptr_t epc, uintptr_t regs[32], uintptr_t
uintptr_t __attribute__((weak))
handle_misaligned_load(uintptr_t cause, uintptr_t epc, uintptr_t regs[32], uintptr_t fregs[32])
{
dump_core("misaligned load", cause, epc, regs, fregs);
/* notice this function only support 16bit or 32bit instruction */
bool compressed = (*(unsigned short *)epc & 3) != 3;
@ -665,6 +666,7 @@ handle_fault_load(uintptr_t cause, uintptr_t epc, uintptr_t regs[32], uintptr_t
uintptr_t __attribute__((weak))
handle_misaligned_store(uintptr_t cause, uintptr_t epc, uintptr_t regs[32], uintptr_t fregs[32])
{
dump_core("misaligned store", cause, epc, regs, fregs);
/* notice this function only support 16bit or 32bit instruction */
bool compressed = (*(unsigned short *)epc & 3) != 3;

View File

@ -691,6 +691,7 @@ typedef struct
struct
{
void* nncase_ctx;
uint32_t nncase_version;
};
};
} kpu_model_context_t;

View File

@ -1,11 +1,6 @@
include_directories(${SDK_ROOT}/third_party/xtl/include ${CMAKE_CURRENT_LIST_DIR}/nncase/include)
add_subdirectory(v0)
add_subdirectory(v1)
FILE(GLOB_RECURSE NNCASE_SRC
"${CMAKE_CURRENT_LIST_DIR}/*.c"
"${CMAKE_CURRENT_LIST_DIR}/*.cpp"
)
ADD_LIBRARY(nncase
${NNCASE_SRC}
)
TARGET_COMPILE_OPTIONS(nncase PRIVATE -O2)
add_library(nncase-wrapper STATIC nncase.cpp)
target_link_libraries(nncase-wrapper PRIVATE nncase-v0 nncase-v1)
target_include_directories(nncase-wrapper PUBLIC include)

View File

@ -12,172 +12,51 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nncase.h>
#include <kernels/k210/k210_kernels.h>
#include <runtime/target_interpreter.h>
#include <stdio.h>
#include "v0/nncase_v0.h"
#include "v1/nncase_v1.h"
#include <cstring>
#include <nncase.h>
#include <stdio.h>
#include <utils.h>
using namespace nncase;
using namespace nncase::runtime;
#define NNCASE_DEBUG 0
namespace
extern "C"
{
void kpu_upload_dma(dmac_channel_number_t dma_ch, const uint8_t *src, uint8_t *dest, size_t input_size, plic_irq_callback_t callback, void *userdata)
{
if (is_memory_cache((uintptr_t)src))
struct model_header
{
std::copy_n(src, input_size, dest);
src -= 0x40000000;
uint32_t identifier;
uint32_t version;
};
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
auto header = reinterpret_cast<const model_header *>(buffer);
if (header->version == 4)
return nncase_v0_load_kmodel(ctx, buffer);
else
return nncase_v1_load_kmodel(ctx, buffer);
}
dmac_set_irq(dma_ch, callback, userdata, 1);
dmac_set_single_mode(dma_ch, (void *)src, (void *)dest, DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
dmac_wait_done(dma_ch);
}
}
class nncase_context
{
public:
int load_kmodel(const uint8_t *buffer)
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
int ret = interpreter_.try_load_model(buffer) ? 0 : -1;
uint32_t size = interpreter_.model_size(buffer);
uint8_t *buffer_iomem = (uint8_t *)((uintptr_t)buffer - IOMEM);
const uint8_t *buffer_cache = buffer;
memcpy(buffer_iomem, buffer_cache, size);
for (int i = 0; i < size; i++)
{
if (buffer_iomem[i] != buffer_cache[i])
{
printf("flush model fail:%d %x %x \n", i, buffer_iomem[i], buffer_cache[i]);
while (1)
;
}
}
return ret;
if (ctx->nncase_version == 0)
return nncase_v0_get_output(ctx, index, data, size);
else
return nncase_v1_get_output(ctx, index, data, size);
}
int get_output(uint32_t index, uint8_t **data, size_t *size)
void nncase_model_free(kpu_model_context_t *ctx)
{
if (index >= interpreter_.outputs_size())
return -1;
auto mem = interpreter_.memory_at<uint8_t>(interpreter_.output_at(index));
*data = mem.data();
*size = mem.size();
return 0;
if (ctx->nncase_version == 0)
return nncase_v0_model_free(ctx);
else
return nncase_v1_model_free(ctx);
}
int run_kmodel(const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
done_callback_ = done_callback;
userdata_ = userdata;
interpreter_.dma_ch(dma_ch);
auto input = interpreter_.input_at(0);
auto mem = interpreter_.memory_at<uint8_t>(input);
if (input.memory_type == mem_main)
{
std::copy(src, src + mem.size(), mem.begin());
interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
return 0;
}
else if (input.memory_type == mem_k210_kpu)
{
auto shape = interpreter_.input_shape_at(0);
kernels::k210::kpu_upload(src, mem.data(), shape);
on_upload_done();
return 0;
}
return -1;
}
private:
void on_done()
{
#if NNCASE_DEBUG
printf("Total: %fms\n", interpreter_.total_duration().count() / 1e6);
#endif
if (done_callback_)
done_callback_(userdata_);
}
void on_upload_done()
{
interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
}
static void done_thunk(void *userdata)
{
reinterpret_cast<nncase_context *>(userdata)->on_done();
}
static void on_error_thunk(const char *err, void *userdata)
{
#if NNCASE_DEBUG
printf("Fatal: %s\n", err);
#endif
}
static void node_profile_thunk(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata)
{
#if NNCASE_DEBUG
printf("%s: %fms\n", node_opcode_names(op).data(), duration.count() / 1e6);
#endif
}
static int upload_done_thunk(void *userdata)
{
reinterpret_cast<nncase_context *>(userdata)->on_upload_done();
return 0;
}
private:
interpreter_t interpreter_;
kpu_done_callback_t done_callback_;
void *userdata_;
};
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
auto nnctx = new (std::nothrow) nncase_context();
if (ctx)
{
ctx->is_nncase = 1;
ctx->nncase_ctx = nnctx;
return nnctx->load_kmodel(buffer);
}
else
{
return -1;
if (ctx->nncase_version == 0)
return nncase_v0_run_kmodel(ctx, src, dma_ch, done_callback, userdata);
else
return nncase_v1_run_kmodel(ctx, src, dma_ch, done_callback, userdata);
}
}
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
return nnctx->get_output(index, data, size);
}
void nncase_model_free(kpu_model_context_t *ctx)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
delete nnctx;
ctx->nncase_ctx = nullptr;
}
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
return nnctx->run_kmodel(src, dma_ch, done_callback, userdata);
}

View File

@ -0,0 +1,11 @@
# Legacy (kmodel v4) nncase runtime, built as library target "nncase-v0".
# NOTE(review): file(GLOB_RECURSE) will not pick up newly added sources until
# the next re-configure; prefer an explicit source list once the set is stable.
file(GLOB_RECURSE NNCASE_SRC
    "${CMAKE_CURRENT_LIST_DIR}/*.c"
    "${CMAKE_CURRENT_LIST_DIR}/*.cpp"
)
add_library(nncase-v0
    ${NNCASE_SRC}
)
# Target-scoped include paths instead of directory-scoped include_directories():
# only nncase-v0 consumes xtl and the bundled v0 nncase headers, and PRIVATE
# matches the old behavior (directory includes did not propagate to linkers).
target_include_directories(nncase-v0 PRIVATE
    ${SDK_ROOT}/third_party/xtl/include
    ${CMAKE_CURRENT_LIST_DIR}/nncase/include
)
target_compile_options(nncase-v0 PRIVATE -O2)

View File

@ -0,0 +1,109 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "datatypes.h"
#include <cassert>
#include <cmath>
#include <limits>
namespace nncase
{
namespace quant
{
// Compute the [min, max] range over all finite values in [begin, end).
// NaN and infinity are skipped (via fpclassify) so they cannot poison the
// quantization range. If no finite value is present the result is the
// inverted sentinel pair {FLT_MAX, -FLT_MAX}.
template <class TIt>
value_range<float> get_range(TIt begin, TIt end)
{
    float min = std::numeric_limits<float>::max();
    // BUG FIX: numeric_limits<float>::min() is the smallest *positive normal*
    // value, not the most negative float, so `max` could never drop below
    // ~1.2e-38 and was wrong for all-negative inputs. lowest() is the correct
    // identity element for a running maximum.
    float max = std::numeric_limits<float>::lowest();
    while (begin != end)
    {
        auto value = *begin++;
        auto fc = std::fpclassify(value);
        if (fc == FP_NORMAL || fc == FP_SUBNORMAL || fc == FP_ZERO)
        {
            min = std::min(min, value);
            max = std::max(max, value);
        }
    }
    return { min, max };
}
// Sanitize a raw statistics range so it is usable for quantization:
// clamp extreme magnitudes to +/-1e3, enforce a minimum span so the scale
// stays finite, and make sure the final range always contains zero.
inline value_range<float> fixup_range(value_range<float> range)
{
    // Clamp outliers.
    if (range.min < -1e3)
        range.min = -1e3;
    if (range.max > 1e3)
        range.max = 1e3;

    // Widen degenerate / too-narrow ranges.
    auto span = range.max - range.min;
    if (span == 0)
        span = 0.1f;
    else if (span < 0.01f)
        span = 0.01f;
    range.max = range.min + span;

    // Zero must be representable.
    if (range.max < 0)
        range.max = 0;
    if (range.min > 0)
        range.min = 0;
    return range;
}
// Derive the (zero-point, scale) pair that maps the fixed-up range onto the
// integer interval [0, 2^bits - 1].
inline quant_param_t get_quant_param(value_range<float> range, int32_t bits)
{
    range = fixup_range(range);
    const auto span = range.max - range.min;
    const auto scale = ((1LL << bits) - 1) / span;
    const auto bias = std::round(-range.min * scale);
    // fixup_range guarantees range.min <= 0, so the zero-point is non-negative.
    assert(bias >= 0);
    return { static_cast<int32_t>(bias), scale };
}
// Decompose `value` into a fixed-point pair (mul, shift) such that
// value ~= mul * 2^-shift, with |mul| < 2^bits and 0 <= shift <= max_shift.
// When `is_signed`, one bit of the budget is reserved for the sign.
inline fixed_mul get_fixed_mul(float value, int32_t max_bits, uint8_t max_shift, bool is_signed)
{
    assert(!is_signed || value >= 0);

    auto bits = is_signed ? max_bits - 1 : max_bits;
    int32_t shift = 0;
    float mul = 0;
    if (std::abs(value) > 1)
    {
        // |value| > 1: frexp yields value = mantissa * 2^mul_shift with
        // mantissa in [0.5, 1). Scale the mantissa up by as much of the bit
        // budget as remains after the value's own exponent, capped at max_shift.
        int mul_shift;
        mul = std::frexp(value, &mul_shift);
        shift = std::min((int32_t)max_shift, bits - mul_shift);
        mul = mul * std::pow(2.f, shift + mul_shift);
    }
    else if (value == 0)
    {
        mul = 0;
        shift = 0;
    }
    else
    {
        // 0 < |value| <= 1 (mul_shift <= 0): push the mantissa into the
        // integer range without exceeding `bits` bits of magnitude.
        int mul_shift;
        mul = std::frexp(value, &mul_shift);
        shift = std::min(max_shift + mul_shift, bits);
        mul = mul * std::pow(2.f, shift);
        shift -= mul_shift;
    }

    // Postconditions: the pair is representable and reconstructs `value`
    // to within one float epsilon.
    assert(std::abs(mul) < std::pow(2, bits));
    assert(shift >= 0 && shift <= max_shift);
    assert(std::abs(value - mul * std::pow(2, -shift)) <= std::numeric_limits<float>::epsilon());
    return { mul, static_cast<int8_t>(shift) };
}
}
}

184
lib/nncase/v0/nncase_v0.cpp Normal file
View File

@ -0,0 +1,184 @@
/* Copyright 2019 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nncase_v0.h>
#include <kernels/k210/k210_kernels.h>
#include <runtime/target_interpreter.h>
#include <stdio.h>
#include <cstring>
#include <utils.h>
using namespace nncase;
using namespace nncase::runtime;
#define NNCASE_DEBUG 0
namespace
{
// Copy `input_size` bytes from `src` to `dest` through the given DMA channel;
// `callback(userdata)` is registered on the channel's completion IRQ.
void kpu_upload_dma(dmac_channel_number_t dma_ch, const uint8_t *src, uint8_t *dest, size_t input_size, plic_irq_callback_t callback, void *userdata)
{
    if (is_memory_cache((uintptr_t)src))
    {
        // Source lives in the cached alias: copy once through the CPU, then
        // rebase `src` by -0x40000000 to the uncached alias so the DMA engine
        // reads coherent data. NOTE(review): the subsequent DMA repeats the
        // copy from the uncached view — presumably intentional for coherency;
        // confirm against the K210 memory map.
        std::copy_n(src, input_size, dest);
        src -= 0x40000000;
    }
    dmac_set_irq(dma_ch, callback, userdata, 1);
    // 64-bit transfer width: input_size is divided by 8, so callers are
    // assumed to pass a multiple of 8 bytes — TODO(review): confirm.
    dmac_set_single_mode(dma_ch, (void *)src, (void *)dest, DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
        DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
    dmac_wait_done(dma_ch);
}
}
// C++ implementation behind the v0 C API. Owns the interpreter instance and
// routes the interpreter's C-style completion/error callbacks back to the
// caller-supplied kpu_done_callback_t.
class nncase_context
{
public:
    // Parse the kmodel in `buffer` and mirror it into the uncached IOMEM
    // alias of the same physical memory so the KPU/DMA engines observe it.
    // Returns 0 on success, -1 if the interpreter rejects the model. On a
    // read-back mismatch it prints a diagnostic and spins forever.
    int load_kmodel(const uint8_t *buffer)
    {
        int ret = interpreter_.try_load_model(buffer) ? 0 : -1;
        uint32_t size = interpreter_.model_size(buffer);
        // Uncached view: cached address minus the IOMEM offset.
        uint8_t *buffer_iomem = (uint8_t *)((uintptr_t)buffer - IOMEM);
        const uint8_t *buffer_cache = buffer;
        memcpy(buffer_iomem, buffer_cache, size);
        // Verify byte-by-byte to detect cache-coherency failures.
        for (int i = 0; i < size; i++)
        {
            if (buffer_iomem[i] != buffer_cache[i])
            {
                printf("flush model fail:%d %x %x \n", i, buffer_iomem[i], buffer_cache[i]);
                while (1)
                    ;
            }
        }
        return ret;
    }

    // Expose output tensor `index` as a (pointer, byte size) pair into the
    // interpreter's memory. Returns 0 on success, -1 if out of range.
    int get_output(uint32_t index, uint8_t **data, size_t *size)
    {
        if (index >= interpreter_.outputs_size())
            return -1;
        auto mem = interpreter_.memory_at<uint8_t>(interpreter_.output_at(index));
        *data = mem.data();
        *size = mem.size();
        return 0;
    }

    // Start inference with `src` as input 0. Main-memory inputs are copied
    // and the interpreter is started directly; KPU-memory inputs are uploaded
    // first. `done_callback(userdata)` fires when the run completes.
    // Returns 0 if started, -1 for an unsupported input memory type.
    int run_kmodel(const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
    {
        done_callback_ = done_callback;
        userdata_ = userdata;
        interpreter_.dma_ch(dma_ch);
        auto input = interpreter_.input_at(0);
        auto mem = interpreter_.memory_at<uint8_t>(input);
        if (input.memory_type == mem_main)
        {
            std::copy(src, src + mem.size(), mem.begin());
            interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
            return 0;
        }
        else if (input.memory_type == mem_k210_kpu)
        {
            auto shape = interpreter_.input_shape_at(0);
            kernels::k210::kpu_upload(src, mem.data(), shape);
            on_upload_done();
            return 0;
        }
        return -1;
    }

private:
    // Run finished: report timing (debug builds only) and notify the user.
    void on_done()
    {
#if NNCASE_DEBUG
        printf("Total: %fms\n", interpreter_.total_duration().count() / 1e6);
#endif
        if (done_callback_)
            done_callback_(userdata_);
    }

    // Input upload complete: start the interpreter.
    void on_upload_done()
    {
        interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
    }

    // Static trampolines: the interpreter's C-style callbacks carry `this`
    // through the userdata pointer.
    static void done_thunk(void *userdata)
    {
        reinterpret_cast<nncase_context *>(userdata)->on_done();
    }

    static void on_error_thunk(const char *err, void *userdata)
    {
#if NNCASE_DEBUG
        printf("Fatal: %s\n", err);
#endif
    }

    static void node_profile_thunk(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata)
    {
#if NNCASE_DEBUG
        printf("%s: %fms\n", node_opcode_names(op).data(), duration.count() / 1e6);
#endif
    }

    static int upload_done_thunk(void *userdata)
    {
        reinterpret_cast<nncase_context *>(userdata)->on_upload_done();
        return 0;
    }

private:
    interpreter_t interpreter_;         // v0 nncase interpreter
    kpu_done_callback_t done_callback_; // user completion callback
    void *userdata_;                    // opaque pointer passed back to the user
};
// Allocate a v0 context, attach it to `ctx` and load the kmodel.
// Returns 0 on success, -1 on failure (null ctx, allocation failure,
// or model parse failure).
int nncase_v0_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
    auto nnctx = new (std::nothrow) nncase_context();
    // BUG FIX: the original only tested `ctx`, so a failed nothrow allocation
    // was dereferenced below, and a null `ctx` leaked `nnctx`.
    if (ctx && nnctx)
    {
        ctx->is_nncase = 1;
        ctx->nncase_ctx = nnctx;
        ctx->nncase_version = 0;
        return nnctx->load_kmodel(buffer);
    }
    else
    {
        delete nnctx; // no-op on nullptr; avoids the leak when ctx is null
        return -1;
    }
}
// C ABI shim: forward the output query to the C++ context stored in the
// handle. Return value comes straight from nncase_context::get_output.
int nncase_v0_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
    auto impl = static_cast<nncase_context *>(ctx->nncase_ctx);
    return impl->get_output(index, data, size);
}
// C ABI shim: destroy the C++ context and clear the handle so a stale
// pointer is never left behind in `ctx`.
void nncase_v0_model_free(kpu_model_context_t *ctx)
{
    auto impl = static_cast<nncase_context *>(ctx->nncase_ctx);
    ctx->nncase_ctx = nullptr;
    delete impl;
}
// C ABI shim: start inference via the C++ context stored in the handle.
int nncase_v0_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
    auto impl = static_cast<nncase_context *>(ctx->nncase_ctx);
    return impl->run_kmodel(src, dma_ch, done_callback, userdata);
}

33
lib/nncase/v0/nncase_v0.h Normal file
View File

@ -0,0 +1,33 @@
/* Copyright 2019 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _NNCASE_V0_H
#define _NNCASE_V0_H

#include "kpu.h"

#ifdef __cplusplus
extern "C" {
#endif

/* v0 runtime entry points; the nncase.cpp wrapper dispatches here for
 * kmodels whose header reports version 4. */

/* Load a kmodel into `ctx`. Returns 0 on success, -1 on failure. */
int nncase_v0_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer);
/* Fetch output tensor `index` as a (pointer, byte size) pair. 0 / -1. */
int nncase_v0_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size);
/* Free the context created by nncase_v0_load_kmodel. */
void nncase_v0_model_free(kpu_model_context_t *ctx);
/* Run inference on `src`; done_callback(userdata) fires on completion. 0 / -1. */
int nncase_v0_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata);

#ifdef __cplusplus
}
#endif

#endif

View File

@ -0,0 +1,6 @@
# Point find_package at the prebuilt nncase v1 runtime config shipped in-tree.
set(nncaseruntime_DIR ${CMAKE_CURRENT_LIST_DIR}/lib/cmake/nncaseruntime)
find_package(nncaseruntime REQUIRED)
# Thin adapter translating the SDK's C API onto the v1 runtime.
add_library(nncase-v1 STATIC nncase_v1.cpp)
# Grouping makes the linker re-scan both archives, presumably because the
# runtime core and the K210 module reference each other.
# NOTE(review): GNU ld's documented spelling is --start-group/--end-group;
# confirm the single-dash form is accepted by this toolchain.
target_link_libraries(nncase-v1 PRIVATE -Wl,-start-group nncaseruntime nncase_rt_modules_k210 -Wl,-end-group)

View File

@ -0,0 +1,28 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS
NNCASE_API result<void> conv2d(const float *input, const float *weights, const float *bias, float *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &w_shape, const runtime_shape_t &w_strides,
const runtime_shape_t &bias_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t groups, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,28 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <nncase/kernels/kernel_context.h>
#include <nncase/kernels/kernel_utils.h>
#include <utility>
BEGIN_NS_NNCASE_KERNELS_CPU_OPT
result<void> conv2d(const float *input, const float *weights, const float *bias, float *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &w_shape, const runtime_shape_t &w_strides,
const runtime_shape_t &bias_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t groups, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context) noexcept;
END_NS_NNCASE_KERNELS_CPU_OPT

View File

@ -0,0 +1,54 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_context.h>
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>

// Opens namespace nncase::kernels::cpu::optimized.
#define BEGIN_NS_NNCASE_KERNELS_CPU_OPT \
    namespace nncase                    \
    {                                   \
    namespace kernels                   \
    {                                   \
    namespace cpu                       \
    {                                   \
    namespace optimized                 \
    {

// Closes the four namespaces opened by BEGIN_NS_NNCASE_KERNELS_CPU_OPT.
#define END_NS_NNCASE_KERNELS_CPU_OPT \
    }                                 \
    }                                 \
    }                                 \
    }

// Expand IMPL(byte_width, unsigned_type) for the element byte width of
// `type`; widths other than 1/2/4/8 return err(std::errc::not_supported).
// Intended for use inside functions returning result<T>.
#define TYPE_IMPL_SELECT(type, IMPL) \
    switch (runtime::get_bytes(type)) \
    { \
        IMPL(1, uint8_t); \
        IMPL(2, uint16_t); \
        IMPL(4, uint32_t); \
        IMPL(8, uint64_t); \
    default: \
        return err(std::errc::not_supported); \
    }

// Strategy selector for the optimized copy kernel, chosen from the
// contiguity of the source/destination layouts.
enum copy_impl_select
{
    all_contiguous,  // both sides contiguous
    src_contiguous,  // only the source is contiguous
    dest_contiguous  // only the destination is contiguous
};

View File

@ -0,0 +1,33 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <cstring>
BEGIN_NS_NNCASE_KERNELS_CPU_OPT
NNCASE_API result<void> concat(datatype_t type, gsl::span<const gsl::byte *const> inputs, gsl::byte *output, const runtime_shape_t &out_shape,
gsl::span<const runtime_shape_t> in_strides, const runtime_shape_t &out_strides, size_t axis, const runtime_shape_t &concat_dims,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &begins, const runtime_shape_t &ends, const runtime_axis_t &strides,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> copy(datatype_t type, const gsl::byte *src, gsl::byte *dest,
const runtime_shape_t &shape, const runtime_shape_t &src_strides, const runtime_shape_t &dest_strides,
int dims_offset, copy_impl_select impl_select, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_OPT

View File

@ -0,0 +1,26 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_context.h>
#include "runtime_types.h"
BEGIN_NS_NNCASE_KERNELS_CPU_REF
NNCASE_API result<void> conv2d(const float *input, const float *weights, const float *bias, float *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &w_shape, const runtime_shape_t &w_strides,
const runtime_shape_t &bias_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t groups, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,23 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS_CPU_REF
NNCASE_API result<void> nnil_unary_method(const float *input, float *output, size_t count, gsl::span<const gsl::byte> body, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,25 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS_CPU_REF
NNCASE_API result<void> reduce_window2d(reduce_op_t op, const float *input, float init_value, float *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,72 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
#define BEGIN_NS_NNCASE_KERNELS_CPU_REF \
namespace nncase \
{ \
namespace kernels \
{ \
namespace cpu \
{ \
namespace reference \
{
#define END_NS_NNCASE_KERNELS_CPU_REF \
} \
} \
} \
}
BEGIN_NS_NNCASE_KERNELS_CPU_REF

namespace detail
{
// Recursively walk the index space described by [index_begin, index_end),
// invoking `callable` once per complete index. `index_prefix` accumulates
// the coordinates chosen so far; try_ presumably short-circuits on the
// first error result — see result.h.
template <class Callable>
result<void> apply_impl(Callable &&callable, runtime_shape_t index_prefix, runtime_shape_t::const_iterator index_begin, runtime_shape_t::const_iterator index_end) noexcept
{
    const auto head = *index_begin++;
    index_prefix.push_back(0);
    if (index_begin == index_end)
    {
        // Innermost dimension: invoke the callable for each coordinate.
        for (size_t i = 0; i < head; i++)
        {
            index_prefix.back() = i;
            try_(callable(index_prefix));
        }
    }
    else
    {
        // Recurse into the next dimension.
        for (size_t i = 0; i < head; i++)
        {
            index_prefix.back() = i;
            try_(apply_impl(std::forward<Callable>(callable), index_prefix, index_begin, index_end));
        }
    }
    return ok();
}
}

// Invoke `callable(index)` for every index of `shape`, last dimension
// varying fastest. Propagates the first non-ok result, else returns ok().
template <class Callable>
result<void> apply(const runtime_shape_t &shape, Callable &&callable) noexcept
{
    return detail::apply_impl(std::forward<Callable>(callable), runtime_shape_t(), shape.cbegin(), shape.cend());
}

END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,70 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_context.h>
#include "runtime_types.h"
BEGIN_NS_NNCASE_KERNELS_CPU_REF
NNCASE_API result<void> batch_to_space(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> broadcast(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> concat(datatype_t type, gsl::span<const gsl::byte *const> inputs, gsl::byte *output, const runtime_shape_t &out_shape,
gsl::span<const runtime_shape_t> in_strides, const runtime_shape_t &out_strides, size_t axis, const runtime_shape_t &concat_dims,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> copy(datatype_t type, const gsl::byte *src, gsl::byte *dest,
const runtime_shape_t &shape, const runtime_shape_t &src_strides, const runtime_shape_t &dest_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> transpose(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &perm, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
// Element-wise binary op over strided float tensors with broadcasting;
// the result of each element is clamped to fused_activation.
NNCASE_API result<void> binary(binary_op_t op, const float *input_a, const float *input_b, float *output,
    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
    const runtime_shape_t &in_b_strides, const runtime_shape_t &out_strides, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
// Converts quantized data back to real values: out = (in - ?) * scale + bias style
// affine mapping driven by scale/bias (exact formula lives in the implementation).
NNCASE_API result<void> dequantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
    kernel_context &context = default_kernel_context) noexcept;
// 1-D lookup-table transform: maps each input element through `table`,
// with inputs clamped to [min, max] before the lookup.
NNCASE_API result<void> lut1d(datatype_t type, const gsl::byte *input, const gsl::byte *table, gsl::byte *output, const runtime_shape_t &shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const scalar &min, const scalar &max) noexcept;
// Pads a tensor per-dimension with the given mode (constant/reflect/etc.)
// and fill value.
NNCASE_API result<void> pad(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_paddings_t &paddings, pad_mode_t mode, const scalar &pad_value,
    kernel_context &context = default_kernel_context) noexcept;
// Quantizes real values into the integer domain described by scale/bias.
NNCASE_API result<void> quantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
    kernel_context &context = default_kernel_context) noexcept;
// Element-wise unary op (neg/abs/exp/...) over a strided float tensor.
NNCASE_API result<void> unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
// Reduction (sum/min/max/mean-style, selected by op) over the given axes,
// seeded with init_value; keep_dims controls whether reduced dims stay as 1.
NNCASE_API result<void> reduce(reduce_op_t op, float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context = default_kernel_context) noexcept;
// Strided slice: copies input[begins:ends:strides] into output.
NNCASE_API result<void> slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &begins, const runtime_shape_t &ends, const runtime_axis_t &strides,
    kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS_CPU_REF

View File

@ -0,0 +1,321 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_utils.h>
#include <nncase/runtime/k210/compiler_defs.h>
#include <nncase/runtime/k210/runtime_op_utility.h>
#include <nncase/runtime/k210/runtime_types.h>
#include <nncase/runtime/result.h>
#include <nncase/runtime/runtime_op_utility.h>
BEGIN_NS_NNCASE_KERNELS_K210
namespace detail
{
// Maps a pooling element type to the accumulator type used by kpu_pool2d:
// uint8_t sums accumulate in uint32_t, float accumulates in float.
// Deliberately has no primary definition body, so unsupported element types
// fail to compile.
template <class T>
struct pool_partial_type;

template <>
struct pool_partial_type<uint8_t>
{
    using type = uint32_t;
};

template <>
struct pool_partial_type<float>
{
    using type = float;
};

// Convenience alias: detail::pool_partial_type_t<T>.
template <class T>
using pool_partial_type_t = typename pool_partial_type<T>::type;
}
// Uploads a feature map from dense NCHW into the KPU's RAM layout using DMA.
result<void> kpu_upload(const uint8_t *src, uint8_t *dest, const runtime::k210::kpu_shape_t &in_shape, uint32_t dma_ch);

// Copies a feature map out of the KPU row layout into a dense NCHW buffer.
// Widths that are a multiple of 64 are already dense and copied in one shot;
// otherwise each row is gathered according to the KPU row layout.
inline result<void> kpu_download(const uint8_t *src, uint8_t *dest, const runtime::k210::kpu_shape_t &in_shape)
{
    using namespace runtime::k210;

    const auto width = in_shape[3];
    if (width % 64 == 0)
    {
        // Dense case: a single contiguous copy is sufficient.
        std::copy(src, src + kernels::detail::compute_size(in_shape), dest);
        return ok();
    }

    const auto layout = get_kpu_row_layout(width);
    const auto fmap_size = get_kpu_bytes(width, in_shape[2], in_shape[1]);
    for (uint32_t b = 0; b < in_shape[0]; b++)
    {
        const uint8_t *batch_base = src + (size_t)b * fmap_size;
        for (uint32_t c = 0; c < in_shape[1]; c++)
        {
            // Channels are packed into groups; each group shares 64-byte rows.
            const uint8_t *channel_base = batch_base
                + (size_t)c / layout.groups * layout.row_len * in_shape[2] * 64
                + (size_t)c % layout.groups * layout.row_pitch;
            for (uint32_t row = 0; row < in_shape[2]; row++)
            {
                const uint8_t *row_base = channel_base + (size_t)row * layout.row_len * 64;
                dest = std::copy(row_base, row_base + width, dest);
            }
        }
    }
    return ok();
}
// Bit-exact software model of the K210 KPU convolution.
// Phase 1 (conv): for each output pixel, accumulate the raw u8*u8 dot product
// plus the KPU's correction terms into the int64 `workspace`:
//   value + (arg_x * sum_x >> shift_x) + (arg_w * sum_w >> shift_w) + arg_add * g_ic
// where sum_x / sum_w are the sums of the sampled inputs / weights.
// Phase 2 (bn act): per-channel batchnorm (mul/shift/add) followed by the
// piecewise-linear activation table, saturated to uint8.
// Padding is implicit 'same' (0 for 1x1 filters, 1 otherwise) using pad_value.
template <bool IsDepthwise, int32_t FilterSize>
void kpu_conv2d(const uint8_t *input, int64_t *workspace, uint8_t *output, const uint8_t *weights, int32_t in_h, int32_t in_w, int32_t in_channels, int32_t out_channels, uint8_t pad_value, int32_t arg_x,
    int32_t shift_x, int32_t arg_w, int32_t shift_w, int64_t arg_add, const runtime::k210::kpu_batchnorm_segment *batchnorm, const runtime::k210::kpu_activation_table_t &activation)
{
    const auto channel_size = size_t(in_h) * in_w;
    // conv
    {
        auto out_it = workspace;
        const auto pad = FilterSize == 1 ? 0 : 1;
        // Depthwise: one group per output channel with one in/out channel each;
        // otherwise a single dense group.
        const auto groups = IsDepthwise ? out_channels : 1;
        const auto g_ic = IsDepthwise ? 1 : in_channels / groups;
        const auto g_oc = IsDepthwise ? 1 : out_channels;

        for (int32_t og = 0; og < groups; og++)
        {
            const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * FilterSize * FilterSize;
            for (int32_t oc = 0; oc < g_oc; oc++)
            {
                const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * FilterSize * FilterSize;
                for (int32_t oy = 0; oy < in_h; oy++)
                {
                    for (int32_t ox = 0; ox < in_w; ox++)
                    {
                        const int32_t in_y_origin = oy - pad;
                        const int32_t in_x_origin = ox - pad;
                        int64_t value = 0;
                        int64_t sum_x = 0, sum_w = 0;

                        for (int32_t ic = 0; ic < g_ic; ic++)
                        {
                            const uint8_t *in_c_p = input + ((size_t)og * g_ic + ic) * in_h * in_w;
                            const uint8_t *w_ic_p = w_oc_p + (size_t)ic * FilterSize * FilterSize;
                            for (int32_t ky = 0; ky < FilterSize; ky++)
                            {
                                for (int32_t kx = 0; kx < FilterSize; kx++)
                                {
                                    const int32_t in_y = in_y_origin + ky;
                                    const int32_t in_x = in_x_origin + kx;

                                    uint8_t x;
                                    // Out-of-image taps read the configured pad value.
                                    if (in_x < 0 || in_x >= in_w
                                        || in_y < 0 || in_y >= in_h)
                                        x = pad_value;
                                    else
                                        x = in_c_p[in_y * in_w + in_x];
                                    uint8_t w = w_ic_p[ky * FilterSize + kx];

                                    sum_x += x;
                                    sum_w += w;
                                    value += (int32_t)x * w;
                                }
                            }
                        }

                        *out_it++ = value + (arg_x * sum_x >> shift_x) + (arg_w * sum_w >> shift_w) + arg_add * g_ic;
                    }
                }
            }
        }
    }

    // bn act
    {
        auto src_it = workspace;
        auto out_it = output;
        for (int32_t oc = 0; oc < out_channels; oc++)
        {
            const auto &bn = batchnorm[oc];
            for (size_t i = 0; i < channel_size; i++)
            {
                auto value = (*src_it++ * bn.mul >> bn.shift) + bn.add;
                // Reverse scan picks the last activation segment whose start_x
                // lies below value (NOTE(review): presumably segment 0 starts
                // low enough to always match — confirm table invariant).
                auto &seg = *std::find_if(activation.rbegin(), activation.rend(), [value](const runtime::k210::kpu_activation_segment &seg) {
                    return value > seg.start_x;
                });
                auto act_value = runtime::carry_shift<int64_t, true>((value - seg.start_x) * seg.mul, seg.shift) + seg.add;
                *out_it++ = (uint8_t)kernels::detail::clamp(act_value, int64_t(0), int64_t(255));
            }
        }
    }
}
// Software model of the KPU pooling unit. Filter size, stride and output
// extents are all derived from the hardware pool_type enum; the accumulator
// type comes from detail::pool_partial_type (uint8 sums in uint32).
template <class T>
inline void kpu_pool2d(const T *input, T *output, int32_t in_h, int32_t in_w, int32_t in_channels, runtime::k210::kpu_pool_type_t pool_type)
{
    using namespace runtime::k210;
    using partial_t = detail::pool_partial_type_t<T>;
    const auto filter = get_kpu_filter_size(pool_type);
    const auto stride = get_kpu_filter_stride(pool_type);
    const auto out_h = get_kpu_pool_output_size(in_h, pool_type);
    const auto out_w = get_kpu_pool_output_size(in_w, pool_type);

    for (int32_t oc = 0; oc < in_channels; oc++)
    {
        auto in_c_p = input + (size_t)oc * in_h * in_w;
        for (int32_t oy = 0; oy < out_h; oy++)
        {
            for (int32_t ox = 0; ox < out_w; ox++)
            {
                const int32_t in_y_origin = oy * stride;
                const int32_t in_x_origin = ox * stride;
                partial_t value = 0;

                switch (pool_type)
                {
                // Stride-only "pooling": pass the window's origin sample through.
                case kpu_pool_bypass:
                {
                    const int32_t in_y = in_y_origin;
                    const int32_t in_x = in_x_origin;

                    value = in_c_p[in_y * in_w + in_x];
                    break;
                }
                // Max pooling; out-of-bounds taps contribute the lowest T value
                // so they never win.
                case kpu_pool_max_2_s2:
                case kpu_pool_max_2_s1:
                case kpu_pool_max_4_s4:
                {
                    value = std::numeric_limits<T>::lowest();
                    for (int32_t ky = 0; ky < filter; ky++)
                    {
                        for (int32_t kx = 0; kx < filter; kx++)
                        {
                            const int32_t in_y = in_y_origin + ky;
                            const int32_t in_x = in_x_origin + kx;

                            partial_t in_v;
                            if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
                                in_v = std::numeric_limits<T>::lowest();
                            else
                                in_v = in_c_p[in_y * in_w + in_x];

                            value = std::max(value, in_v);
                        }
                    }

                    break;
                }
                // Mean pooling; edge taps are clamped to the image border
                // (replicate padding) and the sum is divided by the full
                // filter area.
                case kpu_pool_mean_2_s2:
                case kpu_pool_mean_2_s1:
                case kpu_pool_mean_4_s4:
                {
                    for (int32_t ky = 0; ky < filter; ky++)
                    {
                        for (int32_t kx = 0; kx < filter; kx++)
                        {
                            const int32_t in_y = kernels::detail::clamp(in_y_origin + ky, 0, in_h - 1);
                            const int32_t in_x = kernels::detail::clamp(in_x_origin + kx, 0, in_w - 1);
                            const T in_v = in_c_p[in_y * in_w + in_x];

                            value += in_v;
                        }
                    }

                    value /= filter * filter;
                    break;
                }
                // Select pooling: pick a single sample at a fixed offset inside
                // the window; out-of-bounds selections yield 0.
                case kpu_pool_left_top_2_s2:
                case kpu_pool_left_top_4_s4:
                case kpu_pool_right_top_2_s2:
                {
                    auto k_off = get_kpu_select_pool_offset(pool_type);
                    const int32_t in_y = in_y_origin + k_off[0];
                    const int32_t in_x = in_x_origin + k_off[1];
                    partial_t in_v;
                    if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
                        in_v = 0;
                    else
                        in_v = in_c_p[in_y * in_w + in_x];

                    value = in_v;
                    break;
                }
                }

                *output++ = (T)value;
            }
        }
    }
}
// Float reference ("fake") model of the KPU convolution: same geometry as
// kpu_conv2d ('same' padding, depthwise-or-dense grouping) but computed in
// float with per-channel bias and a fused activation clamp. Output is written
// sequentially in NCHW order.
template <bool IsDepthwise, int32_t FilterSize>
void fake_kpu_conv2d(const float *input, float *output, const float *weights, const float *bias, int32_t in_h, int32_t in_w, int32_t in_channels, int32_t out_channels, value_range<float> fused_activation)
{
    // KPU convs are 'same' padded: 1x1 filters need no border, 3x3 need 1.
    constexpr int32_t pad = FilterSize == 1 ? 0 : 1;
    const int32_t groups = IsDepthwise ? out_channels : 1;
    const int32_t group_ic = IsDepthwise ? 1 : in_channels / groups;
    const int32_t group_oc = IsDepthwise ? 1 : out_channels;

    for (int32_t g = 0; g < groups; g++)
    {
        const float *group_weights = weights + (size_t)g * group_oc * group_ic * FilterSize * FilterSize;
        for (int32_t oc = 0; oc < group_oc; oc++)
        {
            const float *oc_weights = group_weights + (size_t)oc * group_ic * FilterSize * FilterSize;
            for (int32_t oy = 0; oy < in_h; oy++)
            {
                for (int32_t ox = 0; ox < in_w; ox++)
                {
                    const int32_t origin_y = oy - pad;
                    const int32_t origin_x = ox - pad;
                    // Clip the filter window to the valid image area; padded
                    // taps would multiply by zero, so they are simply skipped.
                    const int32_t ky_begin = std::max(0, -origin_y);
                    const int32_t ky_end = std::min(FilterSize, in_h - origin_y);
                    const int32_t kx_begin = std::max(0, -origin_x);
                    const int32_t kx_end = std::min(FilterSize, in_w - origin_x);

                    float acc = bias[g * group_oc + oc];
                    for (int32_t ic = 0; ic < group_ic; ic++)
                    {
                        const float *in_channel = input + ((size_t)g * group_ic + ic) * in_h * in_w;
                        const float *ic_weights = oc_weights + (size_t)ic * FilterSize * FilterSize;
                        for (int32_t ky = ky_begin; ky < ky_end; ky++)
                        {
                            for (int32_t kx = kx_begin; kx < kx_end; kx++)
                            {
                                acc += in_channel[(origin_y + ky) * in_w + (origin_x + kx)]
                                    * ic_weights[ky * FilterSize + kx];
                            }
                        }
                    }
                    *output++ = kernels::detail::apply_activation(acc, fused_activation);
                }
            }
        }
    }
}
END_NS_NNCASE_KERNELS_K210

View File

@ -0,0 +1,27 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/result.h>
BEGIN_NS_NNCASE_KERNELS
// Per-invocation kernel execution context. Currently empty; it exists so
// kernel entry points can accept configuration without future signature
// changes.
struct NNCASE_API kernel_context
{
};

// Shared default instance used when callers don't supply a context.
NNCASE_UNUSED static kernel_context default_kernel_context;
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,240 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <nncase/runtime/datatypes.h>
#include <numeric>
#ifdef __GNUC__
#define CXX_RESTRICT __restrict__
#elif _MSC_VER
#define CXX_RESTRICT __restrict
#else
#define CXX_RESTRICT
#endif
BEGIN_NS_NNCASE_KERNELS
// Dot product of an index, given as [first, last), with a stride vector.
// When the index has more dimensions than strides, only the trailing
// strides.size() dimensions participate.
template <class offset_type, class S, class It>
inline offset_type element_offset(const S &strides, It first, It last) noexcept
{
    using difference_type = typename std::iterator_traits<It>::difference_type;
    const auto dims = static_cast<difference_type>((std::min)(static_cast<typename S::size_type>(std::distance(first, last)), strides.size()));
    offset_type result(0);
    auto index_it = last - dims;
    auto stride_it = strides.cend() - dims;
    while (index_it != last)
        result = result + *index_it++ * *stride_it++;
    return result;
}
// Flattens a multi-dimensional index into an element offset via strides.
template <class TShape>
size_t offset(const TShape &strides, const TShape &index)
{
    assert(strides.size() == index.size());
    return element_offset<size_t>(strides, index.cbegin(), index.cend());
}
// Decomposes a flat (row-major) linear index into per-dimension coordinates
// for new_shape; the last dimension varies fastest.
template <class TShape>
TShape reshape_linear_index(const TShape &new_shape, size_t index)
{
    TShape new_index(new_shape.size());
    for (size_t i = new_shape.size(); i-- > 0;)
    {
        new_index[i] = index % new_shape[i];
        index /= new_shape[i];
    }
    return new_index;
}
// Flattens a multi-dimensional index into a (row-major) linear index.
// Fix: the original read index[0] unconditionally, which is out-of-bounds
// for rank-0 (empty) shapes; accumulating from 0 is identical for non-empty
// shapes and returns 0 for the empty case.
template <class TShape>
size_t linear_index(const TShape &shape, const TShape &index)
{
    assert(index.size() == shape.size());
    size_t new_index = 0;
    for (size_t i = 0; i < shape.size(); i++)
        new_index = new_index * shape[i] + index[i];
    return new_index;
}
namespace detail
{
// Spatial output extent of a windowed op (conv/pool): the classic
// floor((size + pad_total - effective_filter) / stride) + 1, expressed as
// (... + stride) / stride.
inline size_t get_windowed_output_size(size_t size, int32_t filter, int32_t stride, int32_t dilation, const padding &padding)
{
    const int32_t effective_filter_size = (filter - 1) * dilation + 1;
    const int32_t extent = (int32_t)size + padding.before + padding.after - effective_filter_size + stride;
    return (size_t)extent / stride;
}
// Computes the numpy-style broadcast shape of two operand shapes: shapes are
// right-aligned and a dimension of 1 broadcasts against the other operand.
// Incompatible dimensions trigger an assert in debug builds.
inline runtime_shape_t get_binary_output_shape(const runtime_shape_t &input_a_shape, const runtime_shape_t &input_b_shape)
{
    runtime_shape_t out_shape;

    const int32_t dest_dims = (int32_t)std::max(input_a_shape.size(), input_b_shape.size());
    const int32_t a_pad = dest_dims - (int32_t)input_a_shape.size();
    const int32_t b_pad = dest_dims - (int32_t)input_b_shape.size();

    for (int32_t i = 0; i < dest_dims; i++)
    {
        const int32_t a_dim = i - a_pad;
        const int32_t b_dim = i - b_pad;
        // Missing leading dimensions behave as size 1.
        const auto a = a_dim < 0 ? 1 : input_a_shape[a_dim];
        const auto b = b_dim < 0 ? 1 : input_b_shape[b_dim];
        if (a == b || b == 1)
            out_shape.push_back(a);
        else if (a == 1)
            out_shape.push_back(b);
        else
            assert(!"inputs are not compatible to broadcast");
    }

    return out_shape;
}
// Total element count of a shape (product of all dims; empty shape -> 1).
template <class TShape>
size_t compute_size(const TShape &shape)
{
    size_t size = 1;
    for (auto dim : shape)
        size *= dim;
    return size;
}
// Clamps value to [min, max]; like std::max(std::min(value, max), min),
// the lower bound wins if min > max.
template <class T>
inline T clamp(T value, T min, T max)
{
    const T upper = (max < value) ? max : value; // std::min(value, max)
    return (upper < min) ? min : upper;          // std::max(..., min)
}
// Clamps a value to the activation's [min, max] range (fused ReLU-style clip).
template <class T>
inline T apply_activation(T value, value_range<T> activation)
{
    return clamp(value, activation.min, activation.max);
}
// Maps an output coordinate back to a coordinate in a (possibly broadcast)
// input shape: dimensions the input broadcasts over (size 1) collapse to 0,
// and extra leading output dimensions are dropped.
template <class TShape>
TShape get_reduced_offset(const TShape &in_offset, const TShape &reduced_shape)
{
    TShape off(reduced_shape.size());
    const auto dims_ext = in_offset.size() - reduced_shape.size();
    for (size_t i = 0; i < reduced_shape.size(); i++)
        off[i] = in_offset[i + dims_ext] < reduced_shape[i] ? in_offset[i + dims_ext] : 0;
    return off;
}
// Shape after reducing over `axis`: reduced dimensions are dropped, or kept
// as 1 when keep_dims is set. A fully-reduced result becomes {1}.
template <class TShape>
TShape get_reduced_shape(const TShape &in_shape, const TShape &axis, bool keep_dims)
{
    TShape shape;
    shape.reserve(in_shape.size() - (keep_dims ? 0 : axis.size()));
    for (size_t i = 0; i < in_shape.size(); i++)
    {
        const bool reduced = std::find(axis.begin(), axis.end(), i) != axis.end();
        if (!reduced)
            shape.push_back(in_shape[i]);
        else if (keep_dims)
            shape.push_back(1);
    }
    if (shape.empty())
        shape.push_back(1);
    return shape;
}
// Number of input elements folded into each output element of a reduction:
// the product of the sizes of the reduced dimensions.
template <class TShape>
size_t get_reduce_block_size(const TShape &in_shape, const TShape &axis)
{
    size_t block = 1;
    for (size_t dim = 0; dim < in_shape.size(); dim++)
    {
        const bool is_reduced = std::find(axis.begin(), axis.end(), dim) != axis.end();
        if (is_reduced)
            block *= in_shape[dim];
    }
    return block;
}
// Maps an input coordinate to its output coordinate after reducing over
// `axis`: reduced dimensions are dropped, or pinned to 0 when keep_dims is
// set. A fully-reduced result becomes {0}.
template <class TShape>
TShape get_reduced_offset(const TShape &in_offset, const TShape &axis, bool keep_dims)
{
    TShape off;
    off.reserve(in_offset.size() - (keep_dims ? 0 : axis.size()));
    for (size_t i = 0; i < in_offset.size(); i++)
    {
        const bool reduced = std::find(axis.begin(), axis.end(), i) != axis.end();
        if (!reduced)
            off.push_back(in_offset[i]);
        else if (keep_dims)
            off.push_back(0);
    }
    if (off.empty())
        off.push_back(0);
    return off;
}
// Default pointer extractor for ranges that already are raw pointers; used by
// kernels (e.g. concat) that accept a customizable getter for other range
// representations.
template <class T, class TRange>
struct default_ptr_getter
{
    T *operator()(const TRange &range) const noexcept { return range; }
};
// Sign-extends the low `Bits` bits of a 32-bit value into a signed int32.
// Bits == 32 needs no extension (and shifting by 32 would be undefined).
template <int32_t Bits>
int32_t to_signed(uint32_t value)
{
    const uint32_t sign_bit = uint32_t(1) << (Bits - 1);
    if (Bits == 32 || (value & sign_bit) == 0)
        return (int32_t)value;
    const uint32_t extension = 0xFFFFFFFF << Bits;
    return (int)(value | extension);
}
// Sign-extends the low `Bits` bits of a 64-bit value into a signed int64.
// Fix: guard the Bits == 64 case — a 64-bit value needs no extension, and
// `0xFFFFFFFFFFFFFFFF << 64` is undefined behavior. This mirrors the
// `Bits != 32` guard already present in the 32-bit overload.
template <int32_t Bits>
int64_t to_signed(uint64_t value)
{
    auto mask = uint64_t(1) << (Bits - 1);
    if (Bits != 64 && (value & mask) != 0)
    {
        auto sign = 0xFFFFFFFFFFFFFFFF << Bits;
        return (int64_t)(value | sign);
    }
    return (int64_t)value;
}
// Quantizes a float into integer type T: round-to-nearest of
// value/scale + zero_point, saturated to T's representable range.
// NOTE(review): calls unqualified lrintf, which needs <cmath>/<math.h>;
// this header does not include it directly — presumably it arrives
// transitively. Confirm before reorganizing includes.
template <class T>
constexpr T quantize(float value, const quant_param_t &param) noexcept
{
    return (T)clamp((int32_t)lrintf(value / param.scale + param.zero_point), (int32_t)std::numeric_limits<T>::lowest(), (int32_t)std::numeric_limits<T>::max());
}
}
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,795 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "../kernel_utils.h"
#include <cmath>
#include <nncase/runtime/nnil.h>
#include <nncase/runtime/runtime_op_utility.h>
#include <xtl/xspan.hpp>
#ifdef __riscv
#include "../riscv/neutral_kernels.h"
#endif
namespace nncase::kernels::neutral
{
// Element-wise float binary op with numpy-style broadcasting over 4-D shapes.
// Fast path copies element-by-element when both operands share a shape;
// otherwise it walks the output space and maps each coordinate back into the
// operands via get_reduced_offset. Results are clamped to fused_activation.
template <class TOp, class TShape>
void binary(const float *input_a, const float *input_b, float *output, const TShape &in_a_shape,
    const TShape &in_b_shape, const TShape &out_shape, const value_range<float> &fused_activation, TOp &&op)
{
    // opt. no broadcast
    if (in_a_shape == in_b_shape)
    {
        auto size = kernels::detail::compute_size(in_a_shape);
        for (size_t i = 0; i < size; i++)
        {
            const auto a = input_a[i];
            const auto b = input_b[i];
            output[i] = kernels::detail::apply_activation(op(a, b), fused_activation);
        }
    }
    // fallback
    else
    {
        for (size_t d0 = 0; d0 < out_shape[0]; d0++)
        {
            for (size_t d1 = 0; d1 < out_shape[1]; d1++)
            {
                for (size_t d2 = 0; d2 < out_shape[2]; d2++)
                {
                    for (size_t d3 = 0; d3 < out_shape[3]; d3++)
                    {
                        TShape in_off = { d0, d1, d2, d3 };
                        // Broadcast dims collapse to index 0 in each operand.
                        const auto in_a_off = kernels::detail::get_reduced_offset(in_off, in_a_shape);
                        const auto in_b_off = kernels::detail::get_reduced_offset(in_off, in_b_shape);
                        const auto a = input_a[offset(in_a_shape, in_a_off)];
                        const auto b = input_b[offset(in_b_shape, in_b_off)];

                        output[offset(out_shape, in_off)] = kernels::detail::apply_activation(op(a, b), fused_activation);
                    }
                }
            }
        }
    }
}
// Quantized element-wise binary op with numpy-style broadcasting over 4-D
// shapes. Each operand is offset then rescaled (mul / carry-shift) into a
// common domain, combined with `op`, rescaled again, offset and saturated
// to uint8.
// Fix: broadcast-loop counters are size_t (matching `binary` above) instead
// of int32_t — the int32_t counters were compared against the unsigned shape
// elements and narrowed when building the coordinate `in_off`.
template <class TOp, class TShape>
void quantized_binary(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const TShape &in_a_shape,
    const TShape &in_b_shape, const TShape &out_shape, int32_t input_a_offset, int32_t input_a_mul, int32_t input_a_shift,
    int32_t input_b_offset, int32_t input_b_mul, int32_t input_b_shift, int32_t output_mul, int32_t output_shift, int32_t output_offset, TOp &&op)
{
    // opt. no broadcast
    if (in_a_shape == in_b_shape)
    {
        auto size = kernels::detail::compute_size(in_a_shape);
        for (size_t i = 0; i < size; i++)
        {
            auto a = (int32_t)input_a[i];
            auto b = (int32_t)input_b[i];
            a = runtime::mul_and_carry_shift(a + input_a_offset, input_a_mul, input_a_shift);
            b = runtime::mul_and_carry_shift(b + input_b_offset, input_b_mul, input_b_shift);

            auto output_val = runtime::mul_and_carry_shift(op(a, b), output_mul, output_shift);
            output[i] = (uint8_t)std::clamp(output_val + output_offset, 0, 255);
        }
    }
    // fallback
    else
    {
        for (size_t d0 = 0; d0 < out_shape[0]; d0++)
        {
            for (size_t d1 = 0; d1 < out_shape[1]; d1++)
            {
                for (size_t d2 = 0; d2 < out_shape[2]; d2++)
                {
                    for (size_t d3 = 0; d3 < out_shape[3]; d3++)
                    {
                        TShape in_off = { d0, d1, d2, d3 };
                        // Broadcast dims collapse to index 0 in each operand.
                        const auto in_a_off = kernels::detail::get_reduced_offset(in_off, in_a_shape);
                        const auto in_b_off = kernels::detail::get_reduced_offset(in_off, in_b_shape);
                        auto a = (int32_t)input_a[offset(in_a_shape, in_a_off)];
                        auto b = (int32_t)input_b[offset(in_b_shape, in_b_off)];
                        a = runtime::mul_and_carry_shift(a + input_a_offset, input_a_mul, input_a_shift);
                        b = runtime::mul_and_carry_shift(b + input_b_offset, input_b_mul, input_b_shift);

                        auto output_val = runtime::mul_and_carry_shift(op(a, b), output_mul, output_shift);
                        output[offset(out_shape, in_off)] = (uint8_t)std::clamp(output_val + output_offset, 0, 255);
                    }
                }
            }
        }
    }
}
// Concatenates input buffers along a dimension. For each outer slice, input i
// contributes inner_size * concat_dims[i] bytes; `getter` extracts the raw
// pointer from each range element.
template <class TRange, class TPtrGetter = detail::default_ptr_getter<uint8_t, TRange>>
inline void concat(xtl::span<TRange> inputs, uint8_t *output, xtl::span<const int32_t> concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {})
{
    for (size_t outer = 0; outer < outer_size; outer++)
    {
        for (size_t i = 0; i < inputs.size(); i++)
        {
            const size_t chunk = inner_size * concat_dims[i];
            auto src = getter(inputs[i]) + outer * chunk;
            output = std::copy(src, src + chunk, output);
        }
    }
}
// Grouped float conv2d over NCHW input with stride, dilation and asymmetric
// padding. Output spatial size follows detail::get_windowed_output_size;
// every output element starts from its channel's bias and ends with the fused
// activation clamp. Output is written sequentially in dense NCHW order.
template <class TShape>
void conv2d(const float *input, float *output, const float *weights, const float *bias, const TShape &in_shape,
    int32_t groups, int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
    const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
    const auto out_h = detail::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
    const auto out_w = detail::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
    const auto g_ic = in_shape[1] / groups;
    const auto g_oc = (size_t)out_channels / groups;

    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];

        for (size_t og = 0; og < (size_t)groups; og++)
        {
            const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
            const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;

            for (size_t oc = 0; oc < g_oc; oc++)
            {
                const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;

                for (size_t oy = 0; oy < out_h; oy++)
                {
                    for (size_t ox = 0; ox < out_w; ox++)
                    {
                        const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
                        const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
                        // Clip the (dilated) filter window to the valid input
                        // region; padded taps contribute zero, so they're skipped.
                        const size_t filter_y_start = (size_t)std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
                        const size_t filter_y_end = (size_t)std::min(filter_h, ((int32_t)in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
                        const size_t filter_x_start = (size_t)std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
                        const size_t filter_x_end = (size_t)std::min(filter_w, ((int32_t)in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
                        float value = bias[og * g_oc + oc];

                        for (size_t ic = 0; ic < g_ic; ic++)
                        {
                            const float *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
                            const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;

                            for (size_t ky = filter_y_start; ky < filter_y_end; ky++)
                            {
                                for (size_t kx = filter_x_start; kx < filter_x_end; kx++)
                                {
                                    const size_t in_y = in_y_origin + dilation_h * ky;
                                    const size_t in_x = in_x_origin + dilation_w * kx;

                                    const float in_v = in_c_p[in_y * in_shape[3] + in_x];
                                    const float w = w_ic_p[ky * filter_w + kx];

                                    value += in_v * w;
                                }
                            }
                        }

                        *output++ = detail::apply_activation(value, fused_activation);
                    }
                }
            }
        }
    }
}
// Grouped quantized (uint8) conv2d. Inputs and weights are shifted by their
// zero-point offsets, accumulated in int32 with per-channel int32 bias, then
// requantized via mul/carry-shift, offset and saturated to [0, 255].
// Same windowing/grouping geometry as the float conv2d above.
template <class TShape>
void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, int32_t input_offset, int32_t filter_offset,
    int32_t output_mul, int32_t output_shift, int32_t output_offset, const TShape &in_shape, int32_t groups, int32_t out_channels,
    int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
    const padding &padding_h, const padding &padding_w)
{
    const auto out_h = detail::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
    const auto out_w = detail::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
    const auto g_ic = in_shape[1] / groups;
    const auto g_oc = out_channels / groups;

    for (int32_t batch = 0; batch < in_shape[0]; batch++)
    {
        const uint8_t *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];

        for (int32_t og = 0; og < groups; og++)
        {
            const uint8_t *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
            const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;

            for (int32_t oc = 0; oc < g_oc; oc++)
            {
                const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;

                for (int32_t oy = 0; oy < out_h; oy++)
                {
                    for (int32_t ox = 0; ox < out_w; ox++)
                    {
                        const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
                        const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
                        // Clip the (dilated) filter window to the valid input region.
                        const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
                        const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
                        const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
                        const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
                        int32_t value = bias[og * g_oc + oc];

                        for (int32_t ic = 0; ic < g_ic; ic++)
                        {
                            const uint8_t *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
                            const uint8_t *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;

                            for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
                            {
                                for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
                                {
                                    const int32_t in_y = in_y_origin + dilation_h * ky;
                                    const int32_t in_x = in_x_origin + dilation_w * kx;

                                    const int32_t in_v = (int32_t)in_c_p[in_y * in_shape[3] + in_x] + input_offset;
                                    const int32_t w = (int32_t)w_ic_p[ky * filter_w + kx] + filter_offset;

                                    value += in_v * w;
                                }
                            }
                        }

                        // Requantize: scale, add output zero-point, saturate.
                        auto output_val = static_cast<int32_t>(runtime::mul_and_carry_shift(value, output_mul, output_shift));
                        output_val += output_offset;
                        *output++ = (uint8_t)std::clamp(output_val, 0, 255);
                    }
                }
            }
        }
    }
}
// Grouped float transposed conv2d (deconvolution), implemented as a scatter:
// the output is zero-initialized, then each input element accumulates its
// weighted contribution into the output window it maps to.
// Bias is expected to be all zeros (asserted per element in debug builds and
// otherwise unused — hence [[maybe_unused]]); the fused activation clamp is
// applied in a final pass only when it is not the full range.
template <class TShape>
void conv2d_transpose(const float *input, float *output, const float *weights, [[maybe_unused]] const float *bias, const TShape &in_shape,
    int32_t groups, const TShape &out_shape, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
    const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
    std::fill(output, output + kernels::detail::compute_size(out_shape), 0.f);
    const auto g_ic = in_shape[1] / groups;
    const auto g_oc = out_shape[1] / groups;

    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        float *out_batch_p = output + (size_t)batch * out_shape[1] * out_shape[2] * out_shape[3];

        for (size_t g = 0; g < (size_t)groups; g++)
        {
            float *out_group_p = out_batch_p + (size_t)g * g_oc * out_shape[2] * out_shape[3];
            const float *w_group_p = weights + (size_t)g * g_oc * g_ic * filter_h * filter_w;

            for (size_t ic = 0; ic < g_ic; ic++)
            {
                for (size_t iy = 0; iy < in_shape[2]; iy++)
                {
                    for (size_t ix = 0; ix < in_shape[3]; ix++)
                    {
                        const int32_t out_y_origin = (iy * stride_h) - padding_h.before;
                        const int32_t out_x_origin = (ix * stride_w) - padding_w.before;
                        // Clip the scatter window to the valid output region.
                        const size_t filter_y_start = (size_t)std::max(0, (-out_y_origin + dilation_h - 1) / dilation_h);
                        const size_t filter_y_end = (size_t)std::min(filter_h, ((int32_t)out_shape[2] - out_y_origin + dilation_h - 1) / dilation_h);
                        const size_t filter_x_start = (size_t)std::max(0, (-out_x_origin + dilation_w - 1) / dilation_w);
                        const size_t filter_x_end = (size_t)std::min(filter_w, ((int32_t)out_shape[3] - out_x_origin + dilation_w - 1) / dilation_w);
                        const float in_v = *input++;

                        for (size_t oc = 0; oc < g_oc; oc++)
                        {
                            // Non-zero bias is unsupported by this scatter formulation.
                            assert(bias[g * g_oc + oc] == 0.f);
                            float *out_c_p = out_group_p + (size_t)oc * out_shape[2] * out_shape[3];
                            const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
                            const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;

                            for (size_t ky = filter_y_start; ky < filter_y_end; ky++)
                            {
                                for (size_t kx = filter_x_start; kx < filter_x_end; kx++)
                                {
                                    const int32_t out_y = out_y_origin + dilation_h * ky;
                                    const int32_t out_x = out_x_origin + dilation_w * kx;

                                    const float w = w_ic_p[ky * filter_w + kx];

                                    out_c_p[out_y * out_shape[3] + out_x] += in_v * w;
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (fused_activation != value_range<float>::full())
    {
        for (size_t i = 0; i < kernels::detail::compute_size(out_shape); i++)
            output[i] = detail::apply_activation(output[i], fused_activation);
    }
}
// Converts quantized values back to float: (q - zero_point) * scale.
// On RISC-V targets this dispatches to the hand-optimized implementation.
template <class TQ>
void dequantize(const TQ *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, const quant_param_t &param)
{
#if __riscv
    riscv_dequantize(input, output, count, param);
#else
    const auto zero = param.zero_point;
    const auto scale = param.scale;
    for (size_t i = 0; i < count; i++)
        output[i] = (input[i] - zero) * scale;
#endif
}
// Row-major float matmul: output[a_rows x b_cols] = input_a[a_rows x a_cols]
// * input_b[a_cols x b_cols], seeded with one bias per output column and
// finished with the fused activation clamp.
inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
{
    for (int32_t row = 0; row < a_rows; row++)
    {
        const float *a_row = input_a + row * a_cols;
        for (int32_t col = 0; col < b_cols; col++)
        {
            float acc = bias[col];
            for (int32_t k = 0; k < a_cols; k++)
                acc += a_row[k] * input_b[k * b_cols + col];
            output[row * b_cols + col] = detail::apply_activation(acc, fused_activation);
        }
    }
}
// Quantized matmul: accumulates (a + a_offset) * (b + b_offset) in int32 with
// a per-column int32 bias, then requantizes via mul/carry-shift plus
// output_offset and saturates to uint8.
inline void quantized_matmul(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const int32_t *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, int32_t input_a_offset, int32_t input_b_offset,
    int32_t output_mul, int32_t output_shift, int32_t output_offset)
{
    for (int32_t row = 0; row < a_rows; row++)
    {
        for (int32_t col = 0; col < b_cols; col++)
        {
            int32_t acc = bias[col];
            for (int32_t k = 0; k < a_cols; k++)
            {
                const int32_t a = (int32_t)input_a[row * a_cols + k] + input_a_offset;
                const int32_t b = (int32_t)input_b[k * b_cols + col] + input_b_offset;
                acc += a * b;
            }
            const int32_t requantized = static_cast<int32_t>(runtime::mul_and_carry_shift(acc, output_mul, output_shift)) + output_offset;
            output[row * b_cols + col] = (uint8_t)std::clamp(requantized, 0, 255);
        }
    }
}
// Constant-pads a 4-D tensor. The output shape is the input plus each dim's
// before/after padding; elements inside the pad region receive pad_value,
// the rest are copied from the (negatively shifted) input position.
// NOTE(review): in0/in1/in2 intermediate pointers can point outside the input
// while inside the pad region; they are only dereferenced on the copy path,
// but the arithmetic itself relies on that being tolerated — confirm.
template <class T, class TShape, class TPaddings>
void pad(const T *input, T *output, const TShape &in_shape, const TPaddings &paddings, T pad_value)
{
    TShape out_shape = { in_shape[0] + paddings[0].sum(),
        in_shape[1] + paddings[1].sum(),
        in_shape[2] + paddings[2].sum(),
        in_shape[3] + paddings[3].sum() };

    for (int d0 = 0; d0 < out_shape[0]; d0++)
    {
        auto d0_origin = -paddings[0].before;
        auto in0 = input + ((size_t)d0_origin + d0) * in_shape[1] * in_shape[2] * in_shape[3];

        for (int d1 = 0; d1 < out_shape[1]; d1++)
        {
            auto d1_origin = -paddings[1].before;
            auto in1 = in0 + ((size_t)d1_origin + d1) * in_shape[2] * in_shape[3];

            for (int d2 = 0; d2 < out_shape[2]; d2++)
            {
                auto d2_origin = -paddings[2].before;
                auto in2 = in1 + ((size_t)d2_origin + d2) * in_shape[3];

                for (int d3 = 0; d3 < out_shape[3]; d3++)
                {
                    auto d3_origin = -paddings[3].before;

                    // Inside any dim's pad band -> fill; otherwise copy through.
                    if (d0 < paddings[0].before || d0 >= out_shape[0] - paddings[0].after
                        || d1 < paddings[1].before || d1 >= out_shape[1] - paddings[1].after
                        || d2 < paddings[2].before || d2 >= out_shape[2] - paddings[2].after
                        || d3 < paddings[3].before || d3 >= out_shape[3] - paddings[3].after)
                        *output++ = pad_value;
                    else
                        *output++ = in2[d3_origin + d3];
                }
            }
        }
    }
}
// Quantizes floats into TQ: round-to-nearest of value/scale + zero_point,
// saturated to TQ's representable range. Dispatches to the hand-optimized
// RISC-V implementation on those targets.
template <class TQ>
void quantize(const float *CXX_RESTRICT input, TQ *CXX_RESTRICT output, size_t count, const quant_param_t &param)
{
#if __riscv
    riscv_quantize(input, output, count, param);
#else
    constexpr int32_t lo = (int32_t)std::numeric_limits<TQ>::lowest();
    constexpr int32_t hi = (int32_t)std::numeric_limits<TQ>::max();
    for (size_t i = 0; i < count; i++)
    {
        const auto rounded = (int32_t)std::nearbyintf(input[i] / param.scale + param.zero_point);
        output[i] = (TQ)std::clamp(rounded, lo, hi);
    }
#endif
}
// Generic 4-D reduction: output is filled with init_value, then every input
// element is folded into its reduced position with `reducer` (a binary op,
// e.g. sum/min/max). The output coordinate is derived per element via
// get_reduced_offset against reduced_shape.
template <class TReducer, class TShape>
void reduce(const float *input, float *output, float init_value, const TShape &in_shape, const TShape &reduced_shape, TReducer &&reducer)
{
    std::fill(output, output + kernels::detail::compute_size(reduced_shape), init_value);

    for (size_t d0 = 0; d0 < in_shape[0]; d0++)
    {
        for (size_t d1 = 0; d1 < in_shape[1]; d1++)
        {
            for (size_t d2 = 0; d2 < in_shape[2]; d2++)
            {
                for (size_t d3 = 0; d3 < in_shape[3]; d3++)
                {
                    runtime_shape_t in_off = { d0, d1, d2, d3 };
                    auto out_off = kernels::detail::get_reduced_offset(in_off, reduced_shape);
                    const auto a = input[offset(in_shape, in_off)];
                    auto &b = output[offset(reduced_shape, out_off)];
                    b = reducer(b, a);
                }
            }
        }
    }
}
// Applies a unary op element-wise: output[i] = op(input[i]).
template <class TOp>
void unary(const float *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, TOp &&op)
{
    for (size_t i = 0; i < count; i++)
        *output++ = op(*input++);
}
// Sliding-window 2-D reduction (the common kernel behind max/avg pooling).
// Each window is folded with binary_op starting from init_value; window_op
// receives the folded value and the number of in-bounds taps (kernel_count),
// which lets average pooling divide by the actual window area. The result is
// clamped to fused_activation.
template <class TBinaryOp, class TOutputOp, class TShape>
void reduce_window2d(const float *input, float *output, float init_value, const TShape &in_shape, int32_t filter_h, int32_t filter_w,
    int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, const padding &padding_h, const padding &padding_w,
    const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
{
    const auto out_h = kernels::detail::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
    const auto out_w = kernels::detail::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
    runtime_shape_t out_shape { in_shape[0], in_shape[1], out_h, out_w };

    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        for (size_t oc = 0; oc < in_shape[1]; oc++)
        {
            for (size_t oy = 0; oy < out_h; oy++)
            {
                for (size_t ox = 0; ox < out_w; ox++)
                {
                    const int32_t in_y_origin = ((int32_t)oy * stride_h) - padding_h.before;
                    const int32_t in_x_origin = ((int32_t)ox * stride_w) - padding_w.before;
                    // Clip the (dilated) window to the valid input region;
                    // padded taps are excluded from both the fold and the count.
                    const size_t filter_y_start = (size_t)std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
                    const size_t filter_y_end = (size_t)std::min(filter_h, ((int32_t)in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
                    const size_t filter_x_start = (size_t)std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
                    const size_t filter_x_end = (size_t)std::min(filter_w, ((int32_t)in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
                    float value = init_value;
                    int32_t kernel_count = 0;

                    for (size_t ky = filter_y_start; ky < filter_y_end; ky++)
                    {
                        for (size_t kx = filter_x_start; kx < filter_x_end; kx++)
                        {
                            const size_t in_y = in_y_origin + dilation_h * ky;
                            const size_t in_x = in_x_origin + dilation_w * kx;

                            const float in_v = input[offset(in_shape, { batch, oc, in_y, in_x })];

                            value = binary_op(value, in_v);
                            kernel_count++;
                        }
                    }

                    output[offset(out_shape, { batch, oc, oy, ox })] = kernels::detail::apply_activation(window_op(value, kernel_count), fused_activation);
                }
            }
        }
    }
}
// Nearest-neighbour 2-D resize over NCHW data. Each destination pixel
// copies the source pixel at floor(dest * in_extent / out_extent),
// clamped to the source bounds.
template <class T, class TShape>
void resize_nearest_neighbor(const T *input, T *output, const TShape &in_shape, int32_t out_h, int32_t out_w)
{
    const auto height_scale = (float)in_shape[2] / out_h;
    const auto width_scale = (float)in_shape[3] / out_w;
    const size_t plane = (size_t)in_shape[2] * in_shape[3];

    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        for (size_t oc = 0; oc < in_shape[1]; oc++)
        {
            const T *src_plane = input + (batch * in_shape[1] + oc) * plane;
            for (size_t oy = 0; oy < (size_t)out_h; oy++)
            {
                const size_t in_y = std::min((size_t)floorf(oy * height_scale), in_shape[2] - 1);
                const T *src_row = src_plane + in_y * in_shape[3];
                for (size_t ox = 0; ox < (size_t)out_w; ox++)
                {
                    const size_t in_x = std::min((size_t)floorf(ox * width_scale), in_shape[3] - 1);
                    *output++ = src_row[in_x];
                }
            }
        }
    }
}
// Bilinear 2-D resize over NCHW data. When align_corners is set (and the
// output extent is > 1), corner pixels of input and output map onto each
// other exactly. NOTE(review): weights are computed per output pixel in
// float; the expression order is rounding-sensitive, keep as-is.
template <class T, class TShape>
inline void resize_bilinear(const T *input, T *output, const TShape &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
{
    auto height_scale = (float)in_shape[2] / out_h;
    auto width_scale = (float)in_shape[3] / out_w;
    if (align_corners && out_h > 1)
        height_scale = (float)(in_shape[2] - 1) / (out_h - 1);
    if (align_corners && out_w > 1)
        width_scale = (float)(in_shape[3] - 1) / (out_w - 1);

    auto destIdx = 0; // linear write index into the densely-packed output
    for (size_t batch = 0; batch < in_shape[0]; batch++)
    {
        auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
        for (size_t oc = 0; oc < in_shape[1]; oc++)
        {
            auto in_c = in_batch + (size_t)oc * in_shape[2] * in_shape[3];
            for (size_t oy = 0; oy < (size_t)out_h; oy++)
            {
                // Fractional source row and its two neighbouring rows
                // (the lower neighbour is clamped at the bottom edge).
                auto in_y = oy * height_scale;
                auto in_y0 = (size_t)floorf(in_y);
                auto in_y1 = std::min(in_y0 + 1, in_shape[2] - 1);
                for (size_t ox = 0; ox < (size_t)out_w; ox++)
                {
                    auto in_x = ox * width_scale;
                    auto in_x0 = (size_t)floorf(in_x);
                    auto in_x1 = std::min(in_x0 + 1, in_shape[3] - 1);

                    // Four neighbours: v0 = top-left, v1 = bottom-left,
                    // v2 = top-right, v3 = bottom-right.
                    auto v0 = in_c[in_y0 * in_shape[3] + in_x0];
                    auto v1 = in_c[in_y1 * in_shape[3] + in_x0];
                    auto v2 = in_c[in_y0 * in_shape[3] + in_x1];
                    auto v3 = in_c[in_y1 * in_shape[3] + in_x1];

                    // Bilinear weights from the fractional coordinate parts.
                    auto a0 = (1 - (in_y - in_y0)) * (1 - (in_x - in_x0));
                    auto a1 = (in_y - in_y0) * (1 - (in_x - in_x0));
                    auto a2 = (1 - (in_y - in_y0)) * (in_x - in_x0);
                    auto a3 = (in_y - in_y0) * (in_x - in_x0);

                    output[destIdx++] = T(v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3);
                }
            }
        }
    }
}
// Numerically-stable softmax over `outer_size` independent rows of
// `inner_size` elements each. `beta` scales the logits before
// exponentiation (beta == 1 gives the standard softmax).
inline void softmax(const float *input, float *output, float beta, int32_t outer_size, size_t inner_size)
{
    for (int32_t batch = 0; batch < outer_size; batch++)
    {
        const float *row_in = input + batch * inner_size;
        float *row_out = output + batch * inner_size;

        // Subtract the row maximum before exp() so large logits cannot overflow.
        const float row_max = *std::max_element(row_in, row_in + inner_size);
        float denom = 0.f;
        for (size_t i = 0; i < inner_size; i++)
        {
            const float e = expf((row_in[i] - row_max) * beta);
            denom += e;
            row_out[i] = e;
        }

        // Normalize the row so it sums to 1.
        for (size_t i = 0; i < inner_size; i++)
            row_out[i] /= denom;
    }
}
template <class T, class TShape>
void transpose(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const TShape &in_shape, const TShape &in_strides, const TShape &out_strides, const TShape &perm)
{
runtime_shape_t out_shape(in_shape.size());
for (size_t i = 0; i < in_shape.size(); i++)
out_shape[i] = in_shape[perm[i]];
runtime_shape_t i(4), o(4);
for (o[3] = 0; o[3] < out_shape[3]; o[3]++)
{
i[perm[3]] = o[3];
for (o[2] = 0; o[2] < out_shape[2]; o[2]++)
{
i[perm[2]] = o[2];
for (o[1] = 0; o[1] < out_shape[1]; o[1]++)
{
i[perm[1]] = o[1];
for (o[0] = 0; o[0] < out_shape[0]; o[0]++)
{
i[perm[0]] = o[0];
output[offset(out_strides, o)] = input[offset(in_strides, i)];
}
}
}
}
}
// Copies the elements selected by [begin, end) with the given (possibly
// negative) per-axis strides into a densely-packed output, NCHW order.
template <class T, class TShape>
void strided_slice(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const TShape &in_shape, const TShape &begin, const TShape &end, const TShape &strides)
{
    // A negative stride walks backwards, so the stop test flips direction.
    auto keep_going = [](int32_t pos, int32_t stop, int32_t step) {
        return step > 0 ? pos < stop : pos > stop;
    };

    for (int32_t d0 = begin[0]; keep_going(d0, end[0], strides[0]); d0 += strides[0])
    {
        auto plane0 = input + (size_t)d0 * in_shape[1] * in_shape[2] * in_shape[3];
        for (int32_t d1 = begin[1]; keep_going(d1, end[1], strides[1]); d1 += strides[1])
        {
            auto plane1 = plane0 + (size_t)d1 * in_shape[2] * in_shape[3];
            for (int32_t d2 = begin[2]; keep_going(d2, end[2], strides[2]); d2 += strides[2])
            {
                auto row = plane1 + (size_t)d2 * in_shape[3];
                for (int32_t d3 = begin[3]; keep_going(d3, end[3], strides[3]); d3 += strides[3])
                    *output++ = row[d3];
            }
        }
    }
}
// Interprets an NNIL (nncase intermediate language) byte-code program once
// per element: for each input[i] the small stack machine runs until
// nnil_ret, and the popped result is stored to output[i]. Used to evaluate
// compiler-generated unary activation functions.
// Throws std::runtime_error on an unknown opcode.
inline void nnil_unary_method(const float *input, float *output, size_t count, gsl::span<const gsl::byte> body)
{
    using namespace nncase::runtime;

    for (size_t i = 0; i < count; i++)
    {
        // Fresh stack and a reader positioned at the start of the program
        // for every element.
        nnil_evalstack stack;
        span_reader sr(body);
        nnil_reader reader(sr);
        bool ret = false;

        while (reader.avail() && !ret)
        {
            auto op = reader.next();
            switch (op.opcode)
            {
            // --- stack manipulation ---
            case nnil_nop:
                break;
            case nnil_dup:
                stack.dup();
                break;
            case nnil_pop:
                stack.pop();
                break;
            // --- loads: argument 0 and float constants ---
            case nnil_lda_0:
                stack.push(input[i]);
                break;
            case nnil_ldc_r4_0:
                stack.push(0.f);
                break;
            case nnil_ldc_r4_1:
                stack.push(1.f);
                break;
            case nnil_ldc_r4:
                stack.push(op.ldc_r4.r4);
                break;
            // --- unary math (pop one, push one) ---
            case nnil_abs:
                stack.push(fabsf(stack.pop()));
                break;
            case nnil_ceil:
                stack.push(ceilf(stack.pop()));
                break;
            case nnil_cos:
                stack.push(cosf(stack.pop()));
                break;
            case nnil_exp:
                stack.push(expf(stack.pop()));
                break;
            case nnil_floor:
                stack.push(floorf(stack.pop()));
                break;
            case nnil_log:
                stack.push(logf(stack.pop()));
                break;
            case nnil_neg:
                stack.push(-stack.pop());
                break;
            case nnil_rsqrt:
                stack.push(1.f / sqrtf(stack.pop()));
                break;
            case nnil_sin:
                stack.push(sinf(stack.pop()));
                break;
            case nnil_square:
            {
                auto v = stack.pop();
                stack.push(v * v);
                break;
            }
            // --- binary math: right operand is on top of the stack ---
            case nnil_add:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a + b);
                break;
            }
            case nnil_sub:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a - b);
                break;
            }
            case nnil_mul:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a * b);
                break;
            }
            case nnil_div:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a / b);
                break;
            }
            case nnil_min:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(std::min(a, b));
                break;
            }
            case nnil_max:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(std::max(a, b));
                break;
            }
            // --- clamp: pops high, low, then the value ---
            case nnil_clamp:
            {
                auto high = stack.pop();
                auto low = stack.pop();
                auto v = stack.pop();
                stack.push(std::clamp(v, low, high));
                break;
            }
            // --- return: store the top of stack and stop this element ---
            case nnil_ret:
                output[i] = stack.pop();
                ret = true;
                break;
            default:
                throw std::runtime_error("Invalid nnil op");
            }
        }
    }
}
// 256-entry LUT substitution: each byte of `input` is replaced by
// table[byte].
inline void table_lookup1d(const uint8_t *CXX_RESTRICT input, uint8_t *CXX_RESTRICT output, size_t size, const uint8_t *CXX_RESTRICT table)
{
    const uint8_t *src_end = input + size;
    while (input != src_end)
        *output++ = table[*input++];
}
}

View File

@ -0,0 +1,25 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS
NNCASE_API result<void> nnil_unary_method(const float *input, float *output, size_t count, gsl::span<const gsl::byte> body, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,26 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
BEGIN_NS_NNCASE_KERNELS
NNCASE_API result<void> reduce_window2d(reduce_op_t op, const float *input, float init_value, float *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const padding &padding_h, const padding &padding_w,
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS

View File

@ -0,0 +1,83 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "../kernel_utils.h"
#include <cmath>
#include <runtime/runtime_op_utility.h>
#include <xtl/xspan.hpp>
namespace nncase
{
namespace kernels
{
namespace neutral
{
// Dequantize TQ -> float, manually unrolled x2 to keep the in-order K210
// pipeline busy.
// NOTE(review): computes q * (1/param.scale) + (-zero_point/param.scale),
// i.e. real = (q - zero_point) / param.scale; confirm this matches the
// quantizer's scale convention before reuse.
template <class TQ>
void riscv_dequantize(const TQ *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, const quant_param_t &param)
{
    // Precompute multiplier and offset so the loop is a fused multiply-add.
    float scale = 1.f / param.scale;
    float zero = -param.zero_point * scale;

    for (size_t i = 0; i < count / 2; i++)
    {
        // handwritten pipeline for in order CPU
        auto in1_q = input[i * 2];
        auto in2_q = input[i * 2 + 1];
        auto in1 = (float)in1_q;
        auto in2 = (float)in2_q;
        auto out1 = in1 * scale + zero;
        auto out2 = in2 * scale + zero;
        output[i * 2] = out1;
        output[i * 2 + 1] = out2;
    }

    // Tail element when count is odd.
    if (count % 2)
        output[count - 1] = input[count - 1] * scale + zero;
}
// Quantize float -> TQ, unrolled x2; the RISC-V fcvt.w.s instruction is
// issued with the explicit "rne" (round-to-nearest-even) rounding mode so
// results do not depend on the ambient FP environment.
// NOTE(review): the odd tail element uses roundf, which rounds halves away
// from zero - confirm the tail is allowed to round differently from the
// unrolled body.
template <class TQ>
void riscv_quantize(const float *CXX_RESTRICT input, TQ *CXX_RESTRICT output, size_t count, const quant_param_t &param)
{
    float scale = param.scale;
    float zero = param.zero_point;

    for (size_t i = 0; i < count / 2; i++)
    {
        auto in1 = input[i * 2];
        auto in2 = input[i * 2 + 1];
        in1 = in1 * scale + zero;
        in2 = in2 * scale + zero;
        int32_t out1, out2;
        asm volatile("fcvt.w.s %0, %1, rne"
                     : "=r"(out1)
                     : "f"(in1));
        asm volatile("fcvt.w.s %0, %1, rne"
                     : "=r"(out2)
                     : "f"(in2));
        // Saturate to TQ's representable range.
        output[i * 2] = std::clamp(out1, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
        output[i * 2 + 1] = std::clamp(out2, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
    }

    if (count % 2)
    {
        auto in = (int32_t)roundf(input[count - 1] * scale + zero);
        output[count - 1] = std::clamp(in, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
    }
}
}
}
}

View File

@ -0,0 +1,72 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/datatypes.h>
#include <nncase/runtime/error.h>
#include <nncase/runtime/result.h>
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_KERNELS
NNCASE_API result<void> batch_to_space(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> broadcast(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> concat(datatype_t type, gsl::span<const gsl::byte *const> inputs, gsl::byte *output, const runtime_shape_t &out_shape,
gsl::span<const runtime_shape_t> in_strides, const runtime_shape_t &out_strides, size_t axis, const runtime_shape_t &concat_dims,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> copy(datatype_t type, const gsl::byte *src, gsl::byte *dest,
const runtime_shape_t &shape, const runtime_shape_t &src_strides, const runtime_shape_t &dest_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> transpose(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &perm, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> binary(binary_op_t op, const float *input_a, const float *input_b, float *output,
const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
const runtime_shape_t &in_b_strides, const runtime_shape_t &out_strides, value_range<float> fused_activation, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> dequantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> lut1d(datatype_t type, const gsl::byte *input, const gsl::byte *table, gsl::byte *output, const runtime_shape_t &shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const scalar &min, const scalar &max) noexcept;
NNCASE_API result<void> pad(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_paddings_t &paddings, pad_mode_t mode, const scalar &pad_value,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> quantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> reduce(reduce_op_t op, float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context = default_kernel_context) noexcept;
NNCASE_API result<void> slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &begins, const runtime_shape_t &ends, const runtime_axis_t &strides,
kernel_context &context = default_kernel_context) noexcept;
END_NS_NNCASE_KERNELS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,34 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include <nncase/runtime/compiler_defs.h>
BEGIN_NS_NNCASE_RUNTIME
// Opaque base for allocator bookkeeping passed to host_allocator::allocate.
// The virtual destructor enables polymorphic destruction of concrete
// runtime-defined states.
class NNCASE_API allocation_state
{
public:
    virtual ~allocation_state();
};
// Abstract host-memory allocator interface used by the runtime.
class NNCASE_API host_allocator
{
public:
    virtual ~host_allocator();

    // Allocates `bytes` bytes associated with `state`; returns the writable
    // span. NOTE(review): ownership/failure behavior is defined by the
    // concrete implementation - not visible here.
    virtual gsl::span<gsl::byte> allocate(allocation_state &state, size_t bytes) = 0;
};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,353 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cmath>
#include <cstdint>
#include <float.h>
#include <functional>
#include <limits>
#include <nncase/runtime/compiler_defs.h>
namespace nncase
{
// Raw carrier for a 16-bit half-precision value; only the bit pattern is
// stored here, no arithmetic is defined in this header.
struct half
{
    uint16_t value;
};

// Tag type used to disambiguate "construct from raw bit pattern"
// constructor overloads (see bfloat16(from_raw_t, uint16_t)).
struct from_raw_t
{
    explicit from_raw_t() = default;
};

NNCASE_INLINE_VAR constexpr from_raw_t from_raw {};
// bfloat16 ("brain float"): 1 sign, 8 exponent, 7 mantissa bits - i.e. the
// high 16 bits of an IEEE-754 binary32. Stores the raw 16-bit pattern and
// converts to/from float on demand.
struct bfloat16
{
private:
    // Type-punning helper: view a float as its u32 bit pattern and select
    // the 16 bits holding sign/exponent/high-mantissa (endian-dependent).
    union fp32
    {
        uint32_t u32;
        float f32;

        uint16_t u16() const noexcept
        {
            constexpr size_t index = NNCASE_LITTLE_ENDIAN ? 1 : 0;
            return reinterpret_cast<const uint16_t *>(&u32)[index];
        }

        uint16_t &u16() noexcept
        {
            constexpr size_t index = NNCASE_LITTLE_ENDIAN ? 1 : 0;
            return reinterpret_cast<uint16_t *>(&u32)[index];
        }
    };

    // A value that represents "zero".
    static constexpr uint16_t ZERO_VALUE = 0;

    // A value that represents "not a number" (canonical quiet NaN pattern).
    static constexpr uint16_t NAN_VALUE = 0x7FC0;

public:
    bfloat16() noexcept = default;

    // Converting constructor truncates (drops the low mantissa bits, no
    // rounding) - see truncate_to_bfloat16.
    explicit bfloat16(float v) noexcept
        : value_(truncate_to_bfloat16(v).value_) { }

    template <class T, class = std::enable_if_t<std::is_integral<T>::value || std::is_floating_point<T>::value>>
    explicit bfloat16(const T &val) noexcept
        : bfloat16(static_cast<float>(val)) { }

    // Constructs directly from a raw 16-bit pattern (no conversion).
    constexpr bfloat16(from_raw_t, uint16_t value) noexcept
        : value_(value) { }

    // Widening to float is exact: the stored bits become the high half of a
    // binary32 whose low mantissa bits are zero.
    operator float() const noexcept
    {
        fp32 result;
        result.u32 = 0;
        result.u16() = value_;
        return result.f32;
    }

    const uint16_t &raw() const noexcept { return value_; }
    uint16_t &raw() noexcept { return value_; }

    static constexpr bfloat16 from_raw(uint16_t v) noexcept
    {
        return bfloat16(nncase::from_raw, v);
    }

    // Converts by taking the high 16 bits of the binary32 pattern (no
    // rounding); NaNs are squashed to the canonical quiet NaN.
    static bfloat16 truncate_to_bfloat16(const float v) noexcept
    {
        bfloat16 output;

        if (!std::isnan(v))
        {
            fp32 f;
            f.f32 = v;
            output.value_ = f.u16();
        }
        else
        {
            output.value_ = NAN_VALUE;
        }

        return output;
    }

    // Converts a float point to bfloat16, with round-nearest-to-even as rounding
    // method.
    static bfloat16 round_to_bfloat16(float v)
    {
        uint32_t input;
        fp32 f;
        f.f32 = v;
        input = f.u32;
        bfloat16 output;

        if (!std::isnan(v))
        {
            // Least significant bit of resulting bfloat.
            uint32_t lsb = (input >> 16) & 1;
            uint32_t rounding_bias = 0x7fff + lsb;
            input += rounding_bias;
            output.value_ = static_cast<uint16_t>(input >> 16);
        }
        else
        {
            // If the value is a NaN, squash it to a qNaN with msb of fraction set,
            // this makes sure after truncation we don't end up with an inf.
            //
            // qNaN magic: All exponent bits set + most significant bit of fraction
            // set.
            output.value_ = NAN_VALUE;
        }

        return output;
    }

    static constexpr bfloat16 epsilon() noexcept
    {
        // 0x1.0p-7
        return from_raw(0x3c00);
    }

    static constexpr bfloat16 highest() noexcept
    {
        // 0x1.FEp127
        return from_raw(0x7F7F);
    }

    static constexpr bfloat16 min() noexcept
    {
        // 0x1p-126 (smallest positive normal value)
        return from_raw(0x0080);
    }

    static constexpr bfloat16 lowest() noexcept
    {
        // -0x1.FEp127
        return from_raw(0xFF7F);
    }

    static constexpr bfloat16 nan() noexcept
    {
        return from_raw(NAN_VALUE);
    }

    static constexpr bfloat16 infinity() noexcept
    {
        return from_raw(0x7f80);
    }

    // True for both +0 and -0 (sign bit is masked off).
    constexpr bool zero() const noexcept { return (value_ & 0x7FFF) == ZERO_VALUE; }

private:
    uint16_t value_; // raw bfloat16 bit pattern
};
// Arithmetic on bfloat16 widens both operands to float, computes in float,
// then rounds the result back (round-to-nearest-even).
#define DEFINE_BF16_BINARY_BF16RET(x)                            \
    inline bfloat16 operator x(bfloat16 a, bfloat16 b) noexcept  \
    {                                                            \
        return bfloat16::round_to_bfloat16(float(a) x float(b)); \
    }

// Comparisons are computed in float; widening is exact, so ordering matches
// the underlying real values.
#define DEFINE_BF16_BINARY_BOOLRET(x)                       \
    inline bool operator x(bfloat16 a, bfloat16 b) noexcept \
    {                                                       \
        return float(a) x float(b);                         \
    }

DEFINE_BF16_BINARY_BF16RET(+)
DEFINE_BF16_BINARY_BF16RET(-)
DEFINE_BF16_BINARY_BF16RET(*)
DEFINE_BF16_BINARY_BF16RET(/)
DEFINE_BF16_BINARY_BOOLRET(<)
DEFINE_BF16_BINARY_BOOLRET(<=)
DEFINE_BF16_BINARY_BOOLRET(>=)
DEFINE_BF16_BINARY_BOOLRET(>)

// Compound assignments defined in terms of the binary operators above.
#define DEFINE_BF16_BINARY_SELF_MOD(x, op)                    \
    inline bfloat16 &operator x(bfloat16 &a, bfloat16 b) noexcept \
    {                                                         \
        a = a op b;                                           \
        return a;                                             \
    }

DEFINE_BF16_BINARY_SELF_MOD(+=, +)
DEFINE_BF16_BINARY_SELF_MOD(-=, -)
DEFINE_BF16_BINARY_SELF_MOD(*=, *)
DEFINE_BF16_BINARY_SELF_MOD(/=, /)

inline bfloat16 operator-(bfloat16 a) noexcept
{
    return bfloat16::round_to_bfloat16(-float(a));
}

// NOTE(review): equality is bitwise, so NaN == NaN is true and +0 != -0 -
// this differs from IEEE float semantics; confirm callers expect bitwise
// identity.
inline bool operator==(const bfloat16 &lhs, const bfloat16 &rhs) noexcept
{
    return lhs.raw() == rhs.raw();
}

inline bool operator!=(const bfloat16 &lhs, const bfloat16 &rhs) noexcept
{
    return lhs.raw() != rhs.raw();
}
}
namespace std
{
// std::hash support: hashes the exactly-widened float value, so bitwise-equal
// bfloat16 values always hash alike.
template <>
struct hash<nncase::bfloat16>
{
    size_t operator()(const nncase::bfloat16 &v) const
    {
        return hash<float>()(static_cast<float>(v));
    }
};
// std::numeric_limits specialization so generic numeric code can query
// bfloat16 properties (range, precision, special values).
template <>
struct numeric_limits<nncase::bfloat16>
{
    static constexpr float_denorm_style has_denorm = denorm_present;
    static constexpr bool has_infinity = true;
    static constexpr bool has_quiet_NaN = true;
    static constexpr bool has_signaling_NaN = true;
    static constexpr bool is_bounded = true;
    static constexpr bool is_iec559 = true;
    static constexpr bool is_signed = true;
    static constexpr bool is_specialized = true;
    static constexpr float_round_style round_style = round_to_nearest;
    static constexpr int radix = FLT_RADIX;

    [[nodiscard]] static constexpr nncase::bfloat16(min)() noexcept
    {
        return nncase::bfloat16::min();
    }

    [[nodiscard]] static constexpr nncase::bfloat16(max)() noexcept
    {
        return nncase::bfloat16::highest();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 lowest() noexcept
    {
        return nncase::bfloat16::lowest();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 epsilon() noexcept
    {
        return nncase::bfloat16::epsilon();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 round_error() noexcept
    {
        // 0.5
        return nncase::bfloat16::from_raw(0x3f00);
    }

    [[nodiscard]] static constexpr nncase::bfloat16 denorm_min() noexcept
    {
        // NOTE(review): returns the smallest *normal* value; a true bfloat16
        // subnormal minimum would be smaller - confirm this is intentional.
        return nncase::bfloat16::min();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 infinity() noexcept
    {
        return nncase::bfloat16::infinity();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 quiet_NaN() noexcept
    {
        return nncase::bfloat16::nan();
    }

    [[nodiscard]] static constexpr nncase::bfloat16 signaling_NaN() noexcept
    {
        // NOTE(review): same payload as quiet_NaN() - no distinct signaling
        // pattern is defined here.
        return nncase::bfloat16::nan();
    }

    static constexpr int digits = 8; // 7 stored mantissa bits + implicit 1
    static constexpr int max_exponent = FLT_MAX_EXP;
    static constexpr int min_exponent = FLT_MIN_EXP;
};
using nncase::bfloat16;

// <cmath>-style overloads: classify on the exactly-widened float, or
// evaluate in float precision and round the result back to bfloat16
// (round-to-nearest-even).
inline bool isinf(const bfloat16 &a) { return std::isinf(float(a)); }
inline bool isnan(const bfloat16 &a) { return std::isnan(float(a)); }
inline bool isfinite(const bfloat16 &a) { return std::isfinite(float(a)); }

inline bfloat16 abs(const bfloat16 &a) { return bfloat16::round_to_bfloat16(fabsf(float(a))); }
inline bfloat16 exp(const bfloat16 &a) { return bfloat16::round_to_bfloat16(expf(float(a))); }
inline bfloat16 log(const bfloat16 &a) { return bfloat16::round_to_bfloat16(logf(float(a))); }

inline bfloat16 log10(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(log10f(float(a)));
}

inline bfloat16 sqrt(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(sqrtf(float(a)));
}

inline bfloat16 pow(const bfloat16 &a, const bfloat16 &b)
{
    return bfloat16::round_to_bfloat16(powf(float(a), float(b)));
}

inline bfloat16 sin(const bfloat16 &a) { return bfloat16::round_to_bfloat16(sinf(float(a))); }
inline bfloat16 cos(const bfloat16 &a) { return bfloat16::round_to_bfloat16(cosf(float(a))); }
inline bfloat16 tan(const bfloat16 &a) { return bfloat16::round_to_bfloat16(tanf(float(a))); }

inline bfloat16 tanh(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(tanhf(float(a)));
}

inline bfloat16 floor(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(floorf(float(a)));
}

inline bfloat16 ceil(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(ceilf(float(a)));
}

inline bfloat16 round(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(roundf(float(a)));
}

inline bfloat16 nearbyint(const bfloat16 &a)
{
    return bfloat16::round_to_bfloat16(nearbyintf(float(a)));
}

inline long lrint(const bfloat16 &a)
{
    return lrintf(float(a));
}
} // namespace std

View File

@ -0,0 +1,167 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "datatypes.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <span>
namespace nncase::runtime
{
// Little-endian bit-level reader over a byte span. Bits are consumed
// LSB-first from each byte through a 64-bit staging buffer.
class bitreader
{
public:
    // Wraps `data`; no copy is made, the span must outlive the reader.
    bitreader(std::span<const uint8_t> data)
        : data_(data), buffer_(0), avail_(0) { }

    // Reads `bits` bits into `dest`, 8 bits per destination byte (the last
    // byte receives the remainder when `bits` is not a multiple of 8).
    void read(uint8_t *dest, size_t bits)
    {
        while (bits)
        {
            auto to_read = std::min(bits, size_t(8));
            *dest++ = read_bits_le8(to_read);
            bits -= to_read;
        }
    }

    // Reads `Bits` bits into a zero-initialized T and returns it.
    template <class T, size_t Bits>
    T read()
    {
        T ret {};
        read(reinterpret_cast<uint8_t *>(&ret), Bits);
        return ret;
    }

private:
    // Returns the next `bits` (<= 8) bits as the low bits of a byte.
    uint8_t read_bits_le8(size_t bits)
    {
        assert(bits <= 8);
        fill_buffer_le8(bits);
        uint8_t ret = buffer_ & ((size_t(1) << bits) - 1);
        buffer_ >>= bits;
        avail_ -= bits;
        return ret;
    }

    // Tops up the staging buffer so at least `bits` bits are available.
    // NOTE(review): assumes the span still holds data whenever more bits are
    // requested (max_read_bytes != 0 is only asserted, not handled).
    void fill_buffer_le8(size_t bits)
    {
        if (avail_ < bits)
        {
            auto max_read_bytes = std::min(data_.size() * 8, sizeof(buffer_) * 8 - avail_) / 8;
            assert(max_read_bytes != 0);

            uint64_t tmp = 0;
            std::memcpy(&tmp, data_.data(), max_read_bytes);
            data_ = data_.subspan(max_read_bytes);
            // Splice the new bytes above the bits already buffered.
            buffer_ = buffer_ | (tmp << avail_);
            avail_ += max_read_bytes * 8;
        }
    }

private:
    std::span<const uint8_t> data_; // unread bytes
    uint64_t buffer_;               // staged bits, LSB = next bit out
    size_t avail_;                  // number of valid bits in buffer_
};
// Little-endian bit-level writer over a byte span; the mirror of bitreader.
// Bits accumulate LSB-first in a 64-bit staging buffer and are flushed to
// the span in whole bytes.
class bitwriter
{
public:
    // Wraps `data`, optionally resuming `bitoffset` bits into the stream:
    // the already-written low bits of the current byte are preloaded into
    // the buffer so they are preserved when flushed again.
    bitwriter(std::span<uint8_t> data, size_t bitoffset = 0)
        : data_(data), buffer_(0), avail_(sizeof(buffer_) * 8)
    {
        if (bitoffset)
        {
            data_ = data_.subspan(bitoffset / 8);
            bitoffset %= 8;
            buffer_ = data_.front() & ((size_t(1) << bitoffset) - 1);
            avail_ -= bitoffset;
        }
    }

    // Any buffered bits are written out (zero-padded to a whole byte) on
    // destruction.
    ~bitwriter() { flush(); }

    // Writes `bits` bits taken from `src`, 8 bits per source byte.
    void write(const uint8_t *src, size_t bits)
    {
        while (bits)
        {
            auto to_write = std::min(bits, size_t(8));
            write_bits_le8(*src++, to_write);
            bits -= to_write;
        }
    }

    // Writes the low `Bits` bits of `value`.
    template <size_t Bits, class T>
    void write(T value)
    {
        write(reinterpret_cast<const uint8_t *>(&value), Bits);
    }

    // Writes all buffered bits, rounding the final partial byte up.
    void flush()
    {
        auto write_bytes = (buffer_written_bits() + 7) / 8;
        if (write_bytes)
        {
            assert(data_.size() >= write_bytes);
            std::memcpy(data_.data(), &buffer_, write_bytes);
            data_ = data_.subspan(write_bytes);
            buffer_ = 0;
            avail_ = sizeof(buffer_) * 8;
        }
    }

private:
    // Appends the low `bits` (<= 8) bits of `value` to the staging buffer.
    void write_bits_le8(uint8_t value, size_t bits)
    {
        assert(bits <= 8);
        reserve_buffer_8();
        size_t new_value = value & ((size_t(1) << bits) - 1);
        buffer_ = buffer_ | (new_value << buffer_written_bits());
        avail_ -= bits;
    }

    // Guarantees at least 8 bits of free buffer space by flushing the whole
    // bytes currently staged.
    void reserve_buffer_8()
    {
        if (avail_ < 8)
        {
            auto write_bytes = buffer_written_bits() / 8;
            assert(data_.size() >= write_bytes);
            std::memcpy(data_.data(), &buffer_, write_bytes);
            data_ = data_.subspan(write_bytes);

            // Shift out the flushed bytes, keeping any partial-byte residue.
            if (write_bytes == sizeof(buffer_))
                buffer_ = 0;
            else
                buffer_ >>= write_bytes * 8;
            avail_ += write_bytes * 8;
        }
    }

    size_t buffer_written_bits() const noexcept
    {
        return sizeof(buffer_) * 8 - avail_;
    }

private:
    std::span<uint8_t> data_; // remaining writable bytes
    uint64_t buffer_;         // pending bits, LSB written first
    size_t avail_;            // free bit capacity left in buffer_
};
}

View File

@ -0,0 +1,107 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <gsl/gsl-lite.hpp>
#include <type_traits>

// NNCASE_API controls symbol visibility: dllexport when building the nncase
// DLL, dllimport when consuming it as a shared library, empty otherwise
// (including all non-MSVC toolchains).
#if defined(_MSC_VER)
#ifdef NNCASE_DLL
#define NNCASE_API __declspec(dllexport)
#elif defined(NNCASE_SHARED_LIBS)
#define NNCASE_API __declspec(dllimport)
#else
#define NNCASE_API
#endif
#else
#define NNCASE_API
#endif

// Marks a code path the author asserts can never be reached; reaching it at
// runtime is undefined behavior.
#if defined(_MSC_VER)
#define NNCASE_UNREACHABLE() __assume(0)
#else
#define NNCASE_UNREACHABLE() __builtin_unreachable()
#endif

// C++17 and later: use standard inline variables, [[maybe_unused]] and
// std::invoke_result. Pre-C++17: approximate them with the legacy spellings.
#if gsl_CPP17_OR_GREATER
#define NNCASE_INLINE_VAR inline
#define NNCASE_UNUSED [[maybe_unused]]
namespace nncase
{
template <class Callable, class... Args>
using invoke_result_t = std::invoke_result_t<Callable, Args...>;
}
#else
#define NNCASE_INLINE_VAR
#if defined(_MSC_VER)
#define NNCASE_UNUSED
#else
#define NNCASE_UNUSED __attribute__((unused))
#endif
namespace nncase
{
// std::result_of_t is the pre-C++17 spelling of invoke_result.
template <class Callable, class... Args>
using invoke_result_t = std::result_of_t<Callable(Args...)>;
}
#endif

// NOTE(review): endianness is hard-coded; confirm big-endian targets are out
// of scope before porting.
#define NNCASE_LITTLE_ENDIAN 1

#define NNCASE_HAVE_STD_BYTE gsl_CPP17_OR_GREATER
#define NNCASE_NODISCARD gsl_NODISCARD
#define NNCASE_NORETURN gsl_NORETURN

// Namespace open/close helpers so nested namespaces stay consistent across
// the runtime headers.
#define BEGIN_NS_NNCASE_RUNTIME \
namespace nncase \
{ \
namespace runtime \
{
#define END_NS_NNCASE_RUNTIME \
} \
}
#define BEGIN_NS_NNCASE_RT_STACKVM \
namespace nncase \
{ \
namespace runtime \
{ \
namespace stackvm \
{
#define END_NS_NNCASE_RT_STACKVM \
} \
} \
}
#define BEGIN_NS_NNCASE_KERNELS \
namespace nncase \
{ \
namespace kernels \
{
#define END_NS_NNCASE_KERNELS \
} \
}

// Delegates bitmask operator generation to gsl-lite unless already defined.
#ifndef DEFINE_ENUM_BITMASK_OPERATORS
#define DEFINE_ENUM_BITMASK_OPERATORS(ENUMTYPE) gsl_DEFINE_ENUM_BITMASK_OPERATORS(ENUMTYPE)
#endif

namespace nncase
{
// Tag type selecting "leave memory uninitialized" constructor overloads.
struct default_init_t
{
};

NNCASE_INLINE_VAR constexpr default_init_t default_init {};
}

View File

@ -0,0 +1,12 @@
// X-macro table of nncase element types.
// Columns: DEFINE_DATATYPE(id, C++ type, short name, wire value).
// Define DEFINE_DATATYPE before including this file, then #undef it.
// The hex values are part of the serialized model format - do not renumber.
DEFINE_DATATYPE(int8, int8_t, i8, 0x00)
DEFINE_DATATYPE(int16, int16_t, i16, 0x01)
DEFINE_DATATYPE(int32, int32_t, i32, 0x02)
DEFINE_DATATYPE(int64, int64_t, i64, 0x03)
DEFINE_DATATYPE(uint8, uint8_t, u8, 0x04)
DEFINE_DATATYPE(uint16, uint16_t, u16, 0x05)
DEFINE_DATATYPE(uint32, uint32_t, u32, 0x06)
DEFINE_DATATYPE(uint64, uint64_t, u64, 0x07)
DEFINE_DATATYPE(float16, half, f16, 0x08)
DEFINE_DATATYPE(float32, float, f32, 0x09)
DEFINE_DATATYPE(float64, double, f64, 0x0A)
DEFINE_DATATYPE(bfloat16, bfloat16, bf16, 0x0B)

View File

@ -0,0 +1,436 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "bfloat16.h"
#include "compiler_defs.h"
#include "small_vector.hpp"
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
namespace nncase
{
// Element type tags; numeric values come from datatypes.def and are part of
// the serialized model format.
typedef enum _datatype : uint8_t
{
#define DEFINE_DATATYPE(id, t, name, value) dt_##id = value,
#include "datatypes.def"
#undef DEFINE_DATATYPE
} datatype_t;

namespace detail
{
// Maps a datatype_t tag to its C++ type (specializations generated below).
template <datatype_t Type>
struct datatype_to_cpp_type
{
};

// Maps a C++ type back to its datatype_t tag (specializations below).
template <class T>
struct cpp_type_to_datatype
{
};

#if NNCASE_HAVE_STD_BYTE
// std::byte is tagged as uint8 for storage purposes.
template <>
struct cpp_type_to_datatype<std::byte>
{
    static constexpr datatype_t type = dt_uint8;
};
#endif

#define DEFINE_DATATYPE(id, t, name, value) \
template <> \
struct datatype_to_cpp_type<dt_##id> \
{ \
    using type = t; \
}; \
template <> \
struct cpp_type_to_datatype<t> \
{ \
    static constexpr datatype_t type = dt_##id; \
};
#include "datatypes.def"
#undef DEFINE_DATATYPE

// Size in bytes of one element of `type`.
// For an unknown tag the -1 wraps to SIZE_MAX (the return type is unsigned);
// callers should treat that as an error sentinel.
inline constexpr size_t datatype_bytes(datatype_t type)
{
    switch (type)
    {
#define DEFINE_DATATYPE(id, t, name, value) \
    case (dt_##id): \
        return sizeof(t);
#include "datatypes.def"
#undef DEFINE_DATATYPE
    default:
        return -1;
    }
}
}

// Compile-time C++ type -> datatype_t tag.
template <class T>
constexpr datatype_t to_datatype() noexcept
{
    return detail::cpp_type_to_datatype<T>::type;
}

// Compile-time datatype_t tag -> C++ type.
template <datatype_t Type>
using to_cpp_type_t = typename detail::datatype_to_cpp_type<Type>::type;
// Per-dimension padding amounts (values may be negative to express a crop).
struct padding
{
    int32_t before;
    int32_t after;

    // Total padding applied to the dimension.
    int32_t sum() const noexcept { return before + after; }
    static padding zero() noexcept { return {}; }
};

// Closed interval [min, max] of values of T.
template <class T>
struct value_range
{
    T min;
    T max;

    // Whole representable range: +-infinity for floating types (including
    // bfloat16), [lowest, max] for integral types.
    static constexpr value_range<T> full() noexcept
    {
        if (std::is_floating_point<T>::value || std::is_same<T, bfloat16>::value)
            return { -std::numeric_limits<T>::infinity(), std::numeric_limits<T>::infinity() };
        else
            return { std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max() };
    }

    static constexpr value_range<T> nonnegative() noexcept
    {
        return { 0, std::numeric_limits<T>::max() };
    }

    constexpr T length() const noexcept { return max - min; }
};

// Reduction kinds supported by the reduce kernels.
typedef enum _reduce_op
{
    reduce_mean,
    reduce_min,
    reduce_max,
    reduce_sum
} reduce_op_t;
// Element-wise binary operator kinds.
typedef enum _binary_op
{
    binary_add,
    binary_sub,
    binary_mul,
    binary_div,
    binary_min,
    binary_max,
    binary_pow,
    binary_floor_div,
    binary_floor_mod,
    binary_bitwise_and,
    binary_bitwise_or,
    binary_bitwise_xor,
    binary_logical_and,
    binary_logical_or,
    binary_logical_xor
} binary_op_t;

// Human-readable name of a binary operator; "unknown" for any tag outside
// the enumerator range.
inline std::string binary_op_to_string(binary_op_t op)
{
    // Table order mirrors the enumerator order above (values 0..14).
    static const char *const names[] = {
        "binary_add",
        "binary_sub",
        "binary_mul",
        "binary_div",
        "binary_min",
        "binary_max",
        "binary_pow",
        "binary_floor_div",
        "binary_floor_mod",
        "binary_bitwise_and",
        "binary_bitwise_or",
        "binary_bitwise_xor",
        "binary_logical_and",
        "binary_logical_or",
        "binary_logical_xor",
    };
    const auto index = static_cast<size_t>(op);
    if (index < sizeof(names) / sizeof(names[0]))
        return names[index];
    return "unknown";
}
// Element-wise unary operator kinds.
typedef enum _unary_op
{
    unary_abs,
    unary_ceil,
    unary_cos,
    unary_exp,
    unary_floor,
    unary_log,
    unary_neg,
    unary_round,
    unary_rsqrt,
    unary_sin,
    unary_sqrt,
    unary_square,
    unary_tanh,
    unary_bitwise_not,
    unary_logical_not
} unary_op_t;

// Human-readable name of a unary operator; "unknown" for any tag outside
// the enumerator range.
inline std::string unary_op_to_string(unary_op_t op)
{
    // Table order mirrors the enumerator order above (values 0..14).
    static const char *const names[] = {
        "unary_abs",
        "unary_ceil",
        "unary_cos",
        "unary_exp",
        "unary_floor",
        "unary_log",
        "unary_neg",
        "unary_round",
        "unary_rsqrt",
        "unary_sin",
        "unary_sqrt",
        "unary_square",
        "unary_tanh",
        "unary_bitwise_not",
        "unary_logical_not",
    };
    const auto index = static_cast<size_t>(op);
    if (index < sizeof(names) / sizeof(names[0]))
        return names[index];
    return "unknown";
}
// Interpolation modes for the image-resize kernels.
typedef enum _image_resize_mode
{
    image_resize_bilinear,
    image_resize_nearest_neighbor
} image_resize_mode_t;

// Fill strategies for the pad kernel.
typedef enum _pad_mode
{
    pad_constant,
    pad_reflect,
    pad_symmetric,
    pad_edge
} pad_mode_t;

// Affine quantization parameters: real = (quantized - zero_point) * scale.
typedef struct _quant_param
{
    int32_t zero_point;
    float scale;

    // Real-valued range representable by quantized type T with these params.
    template <class T>
    constexpr value_range<float> range() const noexcept
    {
        return {
            (std::numeric_limits<T>::lowest() - zero_point) * scale, (std::numeric_limits<T>::max() - zero_point) * scale
        };
    }
} quant_param_t;

inline bool operator==(const quant_param_t &lhs, const quant_param_t &rhs) noexcept
{
    return lhs.zero_point == rhs.zero_point && lhs.scale == rhs.scale;
}

// Equality allowing an absolute epsilon difference on scale.
// NOTE(review): a fixed machine epsilon is only meaningful for scales near
// 1.0 - confirm callers never compare large-magnitude scales this way.
inline bool almost_equal(const quant_param_t &lhs, const quant_param_t &rhs) noexcept
{
    return lhs.zero_point == rhs.zero_point
        && fabs(lhs.scale - rhs.scale) <= std::numeric_limits<float>::epsilon();
}

// Fixed multiplier pair; rounded_mul() rounds mul to the nearest int32.
// NOTE(review): presumably represents value ~ mul * 2^shift - confirm with
// the quantizer that produces it.
struct fixed_mul
{
    float mul;
    int8_t shift;

    int32_t rounded_mul() const noexcept { return (int32_t)lrintf(mul); }
};

// Memory pool identifiers used by memory_range::memory_location.
using memory_location_t = uint8_t;
NNCASE_INLINE_VAR constexpr memory_location_t mem_input = 0;
NNCASE_INLINE_VAR constexpr memory_location_t mem_output = 1;
NNCASE_INLINE_VAR constexpr memory_location_t mem_rdata = 2;
NNCASE_INLINE_VAR constexpr memory_location_t mem_data = 3;

// Small-vector aliases sized for the common <= 4-D tensor case.
using runtime_shape_t = itlib::small_vector<size_t, 4>;
using runtime_axis_t = itlib::small_vector<int32_t, 4>;
using runtime_paddings_t = itlib::small_vector<padding, 4>;
// Tagged scalar value: a datatype_t tag plus 8 bytes of aligned storage
// reinterpreted through as<T>().
struct scalar
{
    datatype_t type;
    std::aligned_storage_t<8> storage;

    // Default construction leaves type and storage uninitialized.
    scalar() = default;

    scalar(int8_t value) noexcept
    {
        type = dt_int8;
        as<int8_t>() = value;
    }

    scalar(int16_t value) noexcept
    {
        type = dt_int16;
        as<int16_t>() = value;
    }

    scalar(int32_t value) noexcept
    {
        type = dt_int32;
        as<int32_t>() = value;
    }

    scalar(uint8_t value) noexcept
    {
        type = dt_uint8;
        as<uint8_t>() = value;
    }

    scalar(uint16_t value) noexcept
    {
        type = dt_uint16;
        as<uint16_t>() = value;
    }

    scalar(uint32_t value) noexcept
    {
        type = dt_uint32;
        as<uint32_t>() = value;
    }

    scalar(bfloat16 value) noexcept
    {
        type = dt_bfloat16;
        as<bfloat16>() = value;
    }

    scalar(float value) noexcept
    {
        type = dt_float32;
        as<float>() = value;
    }

    // Reinterpret the storage as T; the caller is responsible for matching
    // T against `type`.
    template <class T>
    T &as() noexcept { return *reinterpret_cast<T *>(&storage); }

    template <class T>
    const T &as() const noexcept { return *reinterpret_cast<const T *>(&storage); }
};

// A typed sub-range of one of the memory pools (see memory_location_t).
struct memory_range
{
    memory_location_t memory_location;
    datatype_t datatype;
    uint16_t reserved0;
    uint32_t start;
    uint32_t size;
};

NNCASE_INLINE_VAR constexpr size_t MAX_MODULE_TYPE_LENGTH = 16;
// Fixed-size module type name; unused tail slots are value-initialized.
typedef std::array<char, MAX_MODULE_TYPE_LENGTH> module_type_t;

// Builds a module_type_t from a string literal at compile time (the
// terminating NUL is copied along with the characters).
template <std::size_t N, std::size_t... Is>
constexpr module_type_t
to_module_type(const char (&a)[N], std::index_sequence<Is...>)
{
    return { { a[Is]... } };
}

template <std::size_t N>
constexpr module_type_t to_module_type(const char (&a)[N])
{
    return to_module_type(a, std::make_index_sequence<N>());
}
// Component-wise sum of two paddings.
inline padding operator+(const padding &lhs, const padding &rhs) noexcept
{
    padding result { lhs.before + rhs.before, lhs.after + rhs.after };
    return result;
}

// Two paddings are equal when both components match.
inline bool operator==(const padding &lhs, const padding &rhs) noexcept
{
    return (lhs.before == rhs.before) && (lhs.after == rhs.after);
}

inline bool operator!=(const padding &lhs, const padding &rhs) noexcept
{
    return !(lhs == rhs);
}
template <class T>
bool operator==(const value_range<T> &lhs, const value_range<T> &rhs) noexcept
{
    return lhs.min == rhs.min && lhs.max == rhs.max;
}

template <class T>
bool operator!=(const value_range<T> &lhs, const value_range<T> &rhs) noexcept
{
    return lhs.min != rhs.min || lhs.max != rhs.max;
}

// Scalars are equal when the tags match and the first datatype_bytes(type)
// storage bytes match; trailing storage bytes are ignored.
inline bool operator==(const scalar &lhs, const scalar &rhs) noexcept
{
    auto valid_bytes = detail::datatype_bytes(lhs.type);
    return lhs.type == rhs.type && !memcmp(&lhs.storage, &rhs.storage, valid_bytes);
}

inline bool operator!=(const scalar &lhs, const scalar &rhs) noexcept
{
    auto valid_bytes = detail::datatype_bytes(lhs.type);
    return lhs.type != rhs.type || memcmp(&lhs.storage, &rhs.storage, valid_bytes);
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,48 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <system_error>
BEGIN_NS_NNCASE_RUNTIME

// Runtime error codes surfaced through std::error_condition. Values 0x01xx
// are stackvm errors, 0x02xx are nnil errors.
// NOTE(review): "indentifier" is a typo for "identifier", but the enumerator
// is public API - renaming it would break callers.
enum class nncase_errc
{
    invalid_model_indentifier = 0x01,
    invalid_model_checksum = 0x02,
    invalid_model_version = 0x03,
    runtime_not_found = 0x04,
    datatype_mismatch = 0x05,
    shape_mismatch = 0x06,
    invalid_memory_location = 0x07,
    stackvm_illegal_instruction = 0x0100,
    stackvm_illegal_target = 0x0101,
    stackvm_stack_overflow = 0x0102,
    stackvm_stack_underflow = 0x0103,
    nnil_illegal_instruction = 0x0200,
};

NNCASE_API const std::error_category &nncase_category() noexcept;
NNCASE_API std::error_condition make_error_condition(nncase_errc code);

END_NS_NNCASE_RUNTIME

// Opts nncase_errc into implicit conversion to std::error_condition.
namespace std
{
template <>
struct is_error_condition_enum<nncase::runtime::nncase_errc> : true_type
{
};
}

View File

@ -0,0 +1,96 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_tensor_impl.h"
#include "shared_runtime_tensor.h"
BEGIN_NS_NNCASE_RUNTIME

namespace detail
{
// Cache coherency state of a tensor's host-visible memory.
enum class cache_status_t
{
    valid,
    need_invalidate,
    need_write_back
};

// Owning description of the host memory backing a tensor. Move-only: the
// deleter releases the virtual address exactly once.
struct host_memory_block
{
    host_runtime_tensor::memory_pool_t pool;
    uintptr_t virtual_address;
    size_t size_bytes;
    host_runtime_tensor::data_deleter_t deleter;
    cache_status_t cache_status;
    physical_memory_block physical_block;

    host_memory_block() = default;
    host_memory_block(const host_memory_block &) = delete;
    host_memory_block(host_memory_block && other) noexcept;
    host_memory_block &operator=(const host_memory_block &) = delete;
    host_memory_block &operator=(host_memory_block && other) noexcept;

    ~host_memory_block()
    {
        free();
    }

    // Releases the backing memory; safe to call repeatedly because the
    // deleter is moved out and cleared on first use.
    void free()
    {
        if (auto d = std::move(deleter))
            d(reinterpret_cast<gsl::byte *>(virtual_address));
        deleter = {};
    }

    // Whole allocation viewed as a byte span.
    gsl::span<gsl::byte> virtual_buffer() const noexcept
    {
        return { reinterpret_cast<gsl::byte *>(virtual_address), size_bytes };
    }
};

// runtime_tensor backend for tensors living in host memory; implementations
// of the copy/map/sync entry points live in the corresponding .cpp.
class NNCASE_API host_runtime_tensor_impl : public runtime_tensor_impl
{
public:
    host_runtime_tensor_impl(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, host_memory_block memory_block);

    datatype_t datatype() const noexcept override;
    const runtime_shape_t &shape() const noexcept override;
    const runtime_shape_t &strides() const noexcept override;
    runtime_tensor_type &tensor_type() const noexcept override;
    bool can_copy_from_different_type(const runtime_tensor_impl &src) const noexcept override;
    bool can_copy_to_different_type(const runtime_tensor_impl &dest) const noexcept override;

    result<void> copy_to_same_type(runtime_tensor_impl &dest) noexcept override;
    result<void> copy_from_different_type(runtime_tensor_impl &src) noexcept override;
    result<void> copy_to_different_type(runtime_tensor_impl &dest) noexcept override;
    result<void> copy_from_host(runtime_tensor_impl &src) noexcept override;
    result<void> copy_to_host(runtime_tensor_impl &dest) noexcept override;

    result<host_runtime_tensor::mapped_buffer> map(host_runtime_tensor::map_access_t access) noexcept;
    result<void> unmap(host_runtime_tensor::map_access_t access) noexcept;
    result<void> sync(host_runtime_tensor::sync_op_t op, bool force = false) noexcept;

    const host_memory_block &memory_block() const noexcept { return memory_block_; }
    host_memory_block &memory_block() noexcept { return memory_block_; }

private:
    datatype_t datatype_;
    runtime_shape_t shape_;
    runtime_shape_t strides_;
    host_memory_block memory_block_;
};
}

END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,81 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "allocator.h"
#include "model.h"
#include "result.h"
#include "runtime_module.h"
#include <gsl/gsl-lite.hpp>
#include <memory>
#include <unordered_map>
BEGIN_NS_NNCASE_RUNTIME
// Name -> scalar option store for the interpreter.
//
// Fix: the map was keyed by `const char *`, so lookups compared pointer
// identity rather than string contents. A `get("opt")` whose literal lives
// at a different address than the one passed to `set` (e.g. across
// translation units or with a runtime-built string) silently missed the
// entry. Keying by std::string value makes get/set behave as callers expect
// while leaving both public signatures unchanged.
class NNCASE_API options_dict
{
public:
    // Returns the stored value reinterpreted as T, or
    // std::errc::result_out_of_range when `name` was never set.
    // `name` must be a non-null NUL-terminated string.
    template <class T>
    result<T> get(const char *name)
    {
        auto it = values_.find(name);
        if (it != values_.end())
            return ok(it->second.as<T>());
        else
            return err(std::errc::result_out_of_range);
    }

    // Stores `value` under `name`, overwriting any previous entry with the
    // same name (by content, not by pointer).
    template <class T>
    result<void> set(const char *name, T value)
    {
        values_[name] = scalar(value);
        return ok();
    }

private:
    // Keyed by string content; const char * arguments convert implicitly.
    std::unordered_map<std::string, scalar> values_;
};
// Loads a serialized nncase model and executes it. Move-only.
// Typical flow: load_model() -> fill input_tensor(i) -> run() ->
// read output_tensor(i).
class NNCASE_API interpreter
{
public:
    interpreter() noexcept;
    interpreter(interpreter &) = delete;
    interpreter(interpreter &&) = default;

    // Parses `buffer` and instantiates the contained runtime modules.
    // NOTE(review): lifetime of `buffer` after load_model returns is not
    // visible here - confirm whether the interpreter keeps referencing it.
    NNCASE_NODISCARD result<void> load_model(gsl::span<const gsl::byte> buffer) noexcept;

    // Input/output descriptors of the main module; `index` must be in range.
    size_t inputs_size() const noexcept;
    size_t outputs_size() const noexcept;
    const memory_range &input_desc(size_t index) const noexcept;
    const memory_range &output_desc(size_t index) const noexcept;
    const runtime_shape_t &input_shape(size_t index) const noexcept;
    const runtime_shape_t &output_shape(size_t index) const noexcept;

    // Tensor binding: getters create/return the bound tensor, setters bind
    // a caller-provided tensor.
    result<runtime_tensor> input_tensor(size_t index) noexcept;
    result<void> input_tensor(size_t index, runtime_tensor tensor) noexcept;
    result<runtime_tensor> output_tensor(size_t index) noexcept;
    result<void> output_tensor(size_t index, runtime_tensor tensor) noexcept;

    // Executes the main module once.
    result<void> run() noexcept;

    result<runtime_module *> find_module_by_id(size_t index) noexcept;
    options_dict &options() noexcept;

private:
    std::vector<std::unique_ptr<runtime_module>> modules_;
    runtime_module *main_module_;
    options_dict options_;
};

END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,50 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/runtime/compiler_defs.h>
// Export macro for the K210 runtime module.
// NOTE(review): unlike NNCASE_API in compiler_defs.h, there is no empty
// static-library branch on MSVC - consumers always get dllimport; confirm
// static linking of this module on MSVC is unsupported on purpose.
#if defined(_MSC_VER)
#ifdef NNCASE_MODULES_K210_DLL
#define NNCASE_MODULES_K210_API __declspec(dllexport)
#else
#define NNCASE_MODULES_K210_API __declspec(dllimport)
#endif
#else
#define NNCASE_MODULES_K210_API
#endif

// Namespace helpers for nncase::runtime::k210 and nncase::kernels::k210.
#define BEGIN_NS_NNCASE_RT_K210 \
namespace nncase \
{ \
namespace runtime \
{ \
namespace k210 \
{
#define END_NS_NNCASE_RT_K210 \
} \
} \
}
#define BEGIN_NS_NNCASE_KERNELS_K210 \
namespace nncase \
{ \
namespace kernels \
{ \
namespace k210 \
{
#define END_NS_NNCASE_KERNELS_K210 \
} \
} \
}

View File

@ -0,0 +1,37 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <nncase/runtime/error.h>
BEGIN_NS_NNCASE_RT_K210

// K210-specific error codes surfaced through std::error_condition.
enum class nncase_k210_errc
{
    k210_illegal_instruction = 0x01
};

NNCASE_MODULES_K210_API const std::error_category &nncase_k210_category() noexcept;
NNCASE_MODULES_K210_API std::error_condition make_error_condition(nncase_k210_errc code);

END_NS_NNCASE_RT_K210

// Opts nncase_k210_errc into implicit conversion to std::error_condition.
namespace std
{
template <>
struct is_error_condition_enum<nncase::runtime::k210::nncase_k210_errc> : true_type
{
};
}

View File

@ -0,0 +1,24 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "../runtime_module.h"
#include <nncase/kernels/kernel_context.h>
// Kernel execution context for the K210 backend (currently adds nothing to
// the base kernel_context).
// NOTE(review): the struct carries both NNCASE_MODULES_K210_API (previous
// line) and NNCASE_API - one of the two export macros is almost certainly
// unintended; confirm which one this header should use.
NNCASE_MODULES_K210_API
struct NNCASE_API k210_kernel_context : public kernels::kernel_context
{
};
// NOTE(review): END_NS_NNCASE_KERNELS_K210 has no visible matching BEGIN
// macro above - verify the opening macro was not lost.
END_NS_NNCASE_KERNELS_K210

View File

@ -0,0 +1,48 @@
/* This file is generated by tools/stackvm_gen/IsaGen at 2021/2/23 16:24:09 +08:00.
*
* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
#include <nncase/runtime/result.h>
#include <nncase/runtime/span_reader.h>
BEGIN_NS_NNCASE_RT_K210

// Decodes a K210 instruction stream and dispatches one virtual visit() per
// op; override the per-op overloads (default: accept and continue).
class NNCASE_MODULES_K210_API op_visitor
{
public:
    op_visitor() noexcept
        : reader_({})
    {
    }

    ~op_visitor() = default;

    // Walks every op in `text`, stopping early on error or interruption.
    result<void> visit(gsl::span<const gsl::byte> text) noexcept;

    virtual result<void> visit(NNCASE_UNUSED const kpu_download_options &op) noexcept { return ok(); }
    virtual result<void> visit(NNCASE_UNUSED const kpu_conv2d_options &op) noexcept { return ok(); }
    virtual result<void> visit(NNCASE_UNUSED const kpu_upload_options &op) noexcept { return ok(); }

protected:
    // Set by subclasses to stop the walk after the current op.
    bool interrupted_;
    span_reader reader_;

private:
    // Decodes and dispatches the next op from reader_.
    result<void> next() noexcept;
};

END_NS_NNCASE_RT_K210

View File

@ -0,0 +1,25 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <nncase/runtime/runtime_module.h>
BEGIN_NS_NNCASE_RT_K210

// Module type tag stored in module headers for the K210 backend.
NNCASE_INLINE_VAR constexpr module_type_t k210_module_type = to_module_type("k210");

// Factory registered with the interpreter for "k210" modules.
NNCASE_MODULES_K210_API result<std::unique_ptr<runtime_module>> create_k210_runtime_module();

END_NS_NNCASE_RT_K210

View File

@ -0,0 +1,187 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_types.h"
BEGIN_NS_NNCASE_RT_K210
// How one image row is laid out in KPU RAM.
struct kpu_layout
{
    size_t groups;
    size_t row_len;
    size_t row_pitch;
};

// Row layout chosen by the KPU for a given image width: narrow images pack
// several channel groups into one 64-byte line; wide images span multiple
// 64-byte units per row.
inline kpu_layout get_kpu_row_layout(size_t width)
{
    if (width <= 16)
        return { 4, 1, 16 };
    if (width <= 32)
        return { 2, 1, 32 };
    return { 1, (width + 63) / 64, 64 };
}
// Spatial size of a conv filter; 0 for an unrecognized tag.
inline int32_t get_kpu_filter_size(kpu_filter_type_t filter)
{
    switch (filter)
    {
    case kpu_filter_1x1:
        return 1;
    case kpu_filter_3x3:
        return 3;
    default:
        return 0;
    }
}

// Implicit conv padding (same-padding: 1 for 3x3, 0 for 1x1).
// Any other tag is undefined behavior via NNCASE_UNREACHABLE.
inline int32_t get_kpu_padding(kpu_filter_type_t filter)
{
    switch (filter)
    {
    case kpu_filter_1x1:
        return 0;
    case kpu_filter_3x3:
        return 1;
    default:
        NNCASE_UNREACHABLE();
    }
}

// Pooling padding as { before, after }; `size` is currently unused.
// Only kpu_pool_*_2_s1 pads (after = 1); unknown tags are undefined
// behavior via NNCASE_UNREACHABLE.
inline std::array<int32_t, 2> get_kpu_padding(kpu_pool_type_t filter, NNCASE_UNUSED int32_t size)
{
    switch (filter)
    {
    case kpu_pool_bypass:
        return { 0, 0 };
    case kpu_pool_max_2_s2:
    case kpu_pool_mean_2_s2:
    case kpu_pool_left_top_2_s2:
    case kpu_pool_right_top_2_s2:
        return { 0, 0 };
    case kpu_pool_max_4_s4:
    case kpu_pool_mean_4_s4:
    case kpu_pool_left_top_4_s4:
        return { 0, 0 };
    case kpu_pool_mean_2_s1:
    case kpu_pool_max_2_s1:
        return { 0, 1 };
    default:
        NNCASE_UNREACHABLE();
    }
}
// Number of 64-byte KPU RAM rows needed for a feature map of the given
// width/height/channels, honoring the per-width channel grouping.
inline size_t get_kpu_rows(size_t width, size_t height, size_t channels)
{
    const auto layout = get_kpu_row_layout(width);
    const auto channels_per_block = std::min(channels, layout.groups);
    // Ceiling division: channel groups that do not fill a line still take one.
    const auto channel_blocks = (channels + channels_per_block - 1) / channels_per_block;
    return layout.row_len * height * channel_blocks;
}

// Total KPU RAM bytes for a feature map (rows are 64 bytes each).
inline size_t get_kpu_bytes(size_t width, size_t height, size_t channels)
{
    constexpr size_t row_bytes = 64;
    return get_kpu_rows(width, height, channels) * row_bytes;
}
// Batched variant: bytes for shape [N, C, H, W].
// NOTE(review): returns int32_t while the scalar overload returns size_t -
// large shapes narrow silently; confirm the compiler bounds shapes first.
template <class TShape>
int32_t get_kpu_bytes(const TShape &shape)
{
    return get_kpu_bytes(shape[3], shape[2], shape[1]) * shape[0];
}

// Pooling window size; unknown tags are undefined behavior via
// NNCASE_UNREACHABLE.
inline int32_t get_kpu_filter_size(kpu_pool_type_t filter)
{
    switch (filter)
    {
    case kpu_pool_bypass:
        return 1;
    case kpu_pool_max_2_s2:
    case kpu_pool_mean_2_s2:
    case kpu_pool_left_top_2_s2:
    case kpu_pool_right_top_2_s2:
    case kpu_pool_max_2_s1:
    case kpu_pool_mean_2_s1:
        return 2;
    case kpu_pool_max_4_s4:
    case kpu_pool_mean_4_s4:
    case kpu_pool_left_top_4_s4:
        return 4;
    default:
        NNCASE_UNREACHABLE();
    }
}

// Pooling stride (the _sN suffix of the pool type); unknown tags are
// undefined behavior via NNCASE_UNREACHABLE.
inline int32_t get_kpu_filter_stride(kpu_pool_type_t filter)
{
    switch (filter)
    {
    case kpu_pool_bypass:
        return 1;
    case kpu_pool_max_2_s2:
    case kpu_pool_mean_2_s2:
    case kpu_pool_left_top_2_s2:
    case kpu_pool_right_top_2_s2:
        return 2;
    case kpu_pool_max_2_s1:
    case kpu_pool_mean_2_s1:
        return 1;
    case kpu_pool_max_4_s4:
    case kpu_pool_mean_4_s4:
    case kpu_pool_left_top_4_s4:
        return 4;
    default:
        NNCASE_UNREACHABLE();
    }
}

// Output extent after pooling (integer division by the stride).
inline int32_t get_kpu_pool_output_size(int32_t input, kpu_pool_type_t pool_type)
{
    return input / get_kpu_filter_stride(pool_type);
}

// { row, column } offset sampled by the "select" pool types; other tags are
// undefined behavior via NNCASE_UNREACHABLE.
inline std::array<int32_t, 2> get_kpu_select_pool_offset(kpu_pool_type_t pool_type)
{
    switch (pool_type)
    {
    case kpu_pool_left_top_2_s2:
        return { 0, 0 };
    case kpu_pool_right_top_2_s2:
        return { 0, 1 };
    case kpu_pool_left_top_4_s4:
        return { 0, 0 };
    default:
        NNCASE_UNREACHABLE();
    }
}
END_NS_NNCASE_RT_K210

View File

@ -0,0 +1,331 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <nncase/runtime/datatypes.h>
BEGIN_NS_NNCASE_RT_K210
// Memory pool id for tensors placed in KPU RAM, and its fixed capacity.
NNCASE_INLINE_VAR constexpr memory_location_t mem_kpu = 4;
NNCASE_INLINE_VAR constexpr size_t KPU_RAM_SIZE = 2 * 1024 * 1024; // 2MB

// K210 KPU layer descriptor: each member overlays one 64-bit hardware
// register (`reg` raw view / `data` bit-field view). Field widths and order
// mirror the silicon register layout - never reorder or repack.
typedef struct
{
    // Interrupt/feature enables. NOTE(review): "enabe" typo is part of the
    // public member name.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t int_en : 1;
            uint64_t ram_flag : 1;
            uint64_t full_add : 1;
            uint64_t depth_wise_layer : 1;
            uint64_t reserved : 60;
        } data;
    } interrupt_enabe;

    // Source/destination image addresses in KPU RAM.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t image_src_addr : 15;
            uint64_t reserved0 : 17;
            uint64_t image_dst_addr : 15;
            uint64_t reserved1 : 17;
        } data;
    } image_addr;

    // Input/output channel counts.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t i_ch_num : 10;
            uint64_t reserved0 : 22;
            uint64_t o_ch_num : 10;
            uint64_t reserved1 : 6;
            uint64_t o_ch_num_coef : 10;
            uint64_t reserved2 : 6;
        } data;
    } image_channel_num;

    // Input/output image dimensions.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t i_row_wid : 10;
            uint64_t i_col_high : 9;
            uint64_t reserved0 : 13;
            uint64_t o_row_wid : 10;
            uint64_t o_col_high : 9;
            uint64_t reserved1 : 13;
        } data;
    } image_size;

    // Kernel/pooling configuration and batch-norm table base address.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t kernel_type : 3;
            uint64_t pad_type : 1;
            uint64_t pool_type : 4;
            uint64_t first_stride : 1;
            uint64_t bypass_conv : 1;
            uint64_t load_para : 1;
            uint64_t reserved0 : 5;
            uint64_t dma_burst_size : 8;
            uint64_t pad_value : 8;
            uint64_t bwsx_base_addr : 32;
        } data;
    } kernel_pool_type_cfg;

    // Weight (parameter) load configuration.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t load_coor : 1;
            uint64_t load_time : 6;
            uint64_t reserved0 : 8;
            uint64_t para_size : 17;
            uint64_t para_start_addr : 32;
        } data;
    } kernel_load_cfg;

    // Coefficient addressing offsets.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t coef_column_offset : 4;
            uint64_t coef_row_offset : 12;
            uint64_t reserved0 : 48;
        } data;
    } kernel_offset;

    // Calculation configuration and activation table address.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t channel_switch_addr : 15;
            uint64_t reserved : 1;
            uint64_t row_switch_addr : 4;
            uint64_t coef_size : 8;
            uint64_t coef_group : 3;
            uint64_t load_act : 1;
            uint64_t active_addr : 32;
        } data;
    } kernel_calc_type_cfg;

    // Write-back addressing configuration.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t wb_channel_switch_addr : 15;
            uint64_t reserved0 : 1;
            uint64_t wb_row_switch_addr : 4;
            uint64_t wb_group : 3;
            uint64_t reserved1 : 41;
        } data;
    } write_back_cfg;

    // Convolution quantization arguments (shifts and multipliers).
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t shr_w : 4;
            uint64_t shr_x : 4;
            uint64_t arg_w : 24;
            uint64_t arg_x : 24;
            uint64_t reserved0 : 8;
        } data;
    } conv_value;

    // Convolution additive argument.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t arg_add : 40;
            uint64_t reserved : 24;
        } data;
    } conv_value2;

    // DMA transfer configuration for sending results to main memory.
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t send_data_out : 1;
            uint64_t reserved : 15;
            uint64_t channel_byte_num : 16;
            uint64_t dma_total_byte : 32;
        } data;
    } dma_parameter;
} kpu_layer_argument_t;
// Hardware activation table: 16 piecewise-linear segments plus two bias
// registers (8 bytes of per-segment result bias each). Register overlay -
// do not reorder or repack.
typedef struct
{
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t shift_number : 8;
            uint64_t y_mul : 16;
            uint64_t x_start : 36;
        } data;
    } activate_para[16];

    union
    {
        uint64_t reg;
        struct
        {
            uint8_t result_bias[8];
        } data;
    } activate_para_bias0;

    union
    {
        uint64_t reg;
        struct
        {
            uint8_t result_bias[8];
        } data;
    } activate_para_bias1;
} kpu_activate_table_t;

// Hardware batch-norm arguments: out = (in * norm_mul + norm_add) >>
// norm_shift per the field names; register overlay.
typedef struct
{
    union
    {
        uint64_t reg;
        struct
        {
            uint64_t norm_mul : 24;
            uint64_t norm_add : 32;
            uint64_t norm_shift : 4;
        } data;
    } batchnorm;
} kpu_batchnorm_argument_t;
// Conv filter kinds supported by the KPU (values are hardware encodings).
typedef enum _kpu_filter_type
{
    kpu_filter_1x1 = 0,
    kpu_filter_3x3 = 1
} kpu_filter_type_t;

// Pool kinds; the suffix encodes window and stride (e.g. max_2_s2 =
// 2x2 window, stride 2). Values are hardware encodings.
typedef enum _kpu_pool_type
{
    kpu_pool_bypass = 0,
    kpu_pool_max_2_s2 = 1,
    kpu_pool_mean_2_s2 = 2,
    kpu_pool_max_4_s4 = 3,
    kpu_pool_mean_4_s4 = 4,
    kpu_pool_left_top_2_s2 = 5,
    kpu_pool_right_top_2_s2 = 6,
    kpu_pool_left_top_4_s4 = 7,
    kpu_pool_mean_2_s1 = 8,
    kpu_pool_max_2_s1 = 9
} kpu_pool_type_t;

// Software-side batch-norm segment (mul/shift/add).
struct kpu_batchnorm_segment
{
    int32_t mul;
    int32_t shift;
    int32_t add;
};

inline bool operator==(const kpu_batchnorm_segment &lhs, const kpu_batchnorm_segment &rhs) noexcept
{
    return lhs.mul == rhs.mul && lhs.shift == rhs.shift && lhs.add == rhs.add;
}

inline bool operator!=(const kpu_batchnorm_segment &lhs, const kpu_batchnorm_segment &rhs) noexcept
{
    return !(lhs == rhs);
}

// Software-side activation segment: one piece of the piecewise-linear
// activation starting at start_x.
struct kpu_activation_segment
{
    int64_t start_x;
    int32_t mul;
    int32_t shift;
    int32_t add;
};

inline bool operator==(const kpu_activation_segment &lhs, const kpu_activation_segment &rhs) noexcept
{
    return lhs.start_x == rhs.start_x && lhs.mul == rhs.mul
        && lhs.shift == rhs.shift && lhs.add == rhs.add;
}

inline bool operator!=(const kpu_activation_segment &lhs, const kpu_activation_segment &rhs) noexcept
{
    return !(lhs == rhs);
}

// 16 segments mirrors the hardware activation table size.
using kpu_activation_table_t = std::array<kpu_activation_segment, 16>;
// Shape as [N, C, H, W].
using kpu_shape_t = std::array<uint32_t, 4>;

// Opcodes of the serialized K210 instruction stream.
enum class opcode_t : uint8_t
{
    kpu_upload,
    kpu_download,
    kpu_conv2d
};

// Copy a tensor from main memory into KPU RAM.
struct kpu_upload_options
{
    opcode_t opcode = opcode_t::kpu_upload;
    uint8_t reserved0[3];
    memory_range input;
    memory_range output;
    kpu_shape_t in_shape;
};

// Copy a tensor from KPU RAM back to main memory.
struct kpu_download_options
{
    opcode_t opcode = opcode_t::kpu_download;
    uint8_t reserved0[3];
    memory_range input;
    memory_range output;
    kpu_shape_t in_shape;
};

// Run one conv2d layer; `layer` is the raw hardware descriptor.
struct kpu_conv2d_options
{
    opcode_t opcode = opcode_t::kpu_conv2d;
    uint8_t reserved0[3];
    memory_range weights;
    memory_range batch_norm;
    memory_range activation;
    memory_range main_mem_output;
    uint32_t batches;
    kpu_layer_argument_t layer;
};

END_NS_NNCASE_RT_K210

View File

@ -0,0 +1,89 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "datatypes.h"
#include <cassert>
BEGIN_NS_NNCASE_RUNTIME
// Fixed storage size (bytes) of a section name in the serialized model.
NNCASE_INLINE_VAR constexpr size_t MAX_SECTION_NAME_LENGTH = 16;
// Top-level header of a serialized .kmodel file.
struct model_header
{
    uint32_t identifier; // model magic (see MODEL_IDENTIFIER)
    uint32_t version; // serialization version (see MODEL_VERSION)
    uint32_t flags;
    uint32_t alignment;
    uint32_t modules; // module count
    uint32_t main_module; // index of the entry module
};
// Header of one runtime module inside the model.
struct module_header
{
    module_type_t type;
    uint32_t size;
    uint32_t mempools;
    uint32_t inputs;
    uint32_t outputs;
    uint32_t sections;
    uint32_t reserved0; // padding/reserved
};
// Memory requirement of a module: byte count at a given memory location.
struct mempool_desc
{
    memory_location_t location;
    uint32_t size;
};
// Header of a named data section within a module.
struct section_header
{
    char name[MAX_SECTION_NAME_LENGTH];
    uint32_t flags; // e.g. SECTION_MERGED_INTO_RDATA
    uint32_t start;
    uint32_t size;
    uint32_t reserved0;
};
// Flag: the section body is stored inside the .rdata section.
NNCASE_INLINE_VAR constexpr uint32_t SECTION_MERGED_INTO_RDATA = 1;
// In-place view over a serialized shape: a uint32 count followed immediately
// in memory by `size` uint32 dimensions. Never constructed or copied -- only
// cast onto model data (hence the deleted constructors).
struct shape_header
{
    uint32_t size;

    shape_header() = delete;
    shape_header(shape_header &) = delete;
    shape_header &operator=(shape_header &) = delete;

    // Dimensions start right after this header in the serialized buffer.
    const uint32_t *begin() const noexcept
    {
        return reinterpret_cast<const uint32_t *>(reinterpret_cast<uintptr_t>(this) + sizeof(shape_header));
    }

    const uint32_t *end() const noexcept
    {
        return begin() + size;
    }

    // Bounds checked by assert only (debug builds).
    uint32_t operator[](size_t index) const
    {
        assert(index < size);
        return begin()[index];
    }
};
// 'KMDL' magic. NOTE(review): multi-character literals have an
// implementation-defined value; relies on compiler and runtime agreeing
// on the packing -- confirm toolchain assumptions.
NNCASE_INLINE_VAR constexpr uint32_t MODEL_IDENTIFIER = 'KMDL';
NNCASE_INLINE_VAR constexpr uint32_t MODEL_VERSION = 5;

View File

@ -0,0 +1,134 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include "span_reader.h"
#include <array>
#include <cassert>
BEGIN_NS_NNCASE_RUNTIME
// NNIL opcodes for the activation bytecode interpreter. Value ranges group the
// ops: 0x0x stack/constant loads, 0x2x unary math, 0x4x binary math,
// 0x80 clamp, 0xA0 return.
typedef enum _nnil_opcode
{
    nnil_nop = 0x00,
    nnil_dup = 0x01,
    nnil_pop = 0x02,
    nnil_lda_0 = 0x03,
    nnil_ldc_r4_0 = 0x04,
    nnil_ldc_r4_1 = 0x05,
    nnil_ldc_r4 = 0x06, // followed by an inline float32 operand
    nnil_abs = 0x20,
    nnil_ceil = 0x21,
    nnil_cos = 0x22,
    nnil_exp = 0x23,
    nnil_floor = 0x24,
    nnil_log = 0x25,
    nnil_neg = 0x26,
    nnil_rsqrt = 0x27,
    nnil_sin = 0x28,
    nnil_sqrt = 0x29,
    nnil_square = 0x2A,
    nnil_tanh = 0x2B,
    nnil_bitwise_not = 0x2C,
    nnil_logical_not = 0x2D,
    nnil_round = 0x2E,
    nnil_add = 0x40,
    nnil_sub = 0x41,
    nnil_mul = 0x42,
    nnil_div = 0x43,
    nnil_min = 0x44,
    nnil_max = 0x45,
    nnil_clamp = 0x80,
    nnil_ret = 0xA0
} nnil_opcode_t;
// Inline operand of nnil_ldc_r4: a float32 constant to push.
typedef struct _nnil_ldc_r4
{
    float r4;
} nnil_ldc_r4_t;
// One decoded instruction: opcode plus (for nnil_ldc_r4 only) its operand.
typedef struct _nnil_op
{
    nnil_opcode_t opcode;
    union
    {
        nnil_ldc_r4_t ldc_r4;
    };
} nnil_op_t;
// Sequential decoder of NNIL bytecode from a span_reader. Only nnil_ldc_r4
// carries an inline operand; all other ops are a single opcode byte.
class nnil_reader
{
public:
    nnil_reader(span_reader &reader)
        : reader_(reader) { }

    // True while at least one byte remains in the stream.
    bool avail() const noexcept { return !reader_.empty(); }

    // Decode the next instruction; asserts that bytes remain.
    nnil_op_t next()
    {
        assert(avail());
        nnil_op_t op;
        op.opcode = (nnil_opcode_t)reader_.read<uint8_t>();
        switch (op.opcode)
        {
        case nnil_ldc_r4:
            // Operand is not guaranteed to be 4-byte aligned in the stream.
            op.ldc_r4 = reader_.read_unaligned<nnil_ldc_r4_t>();
            break;
        default:
            break;
        }
        return op;
    }

private:
    span_reader &reader_;
};
// Fixed-capacity (64 entries) float evaluation stack for interpreting NNIL
// activation bytecode. Bounds are enforced with assert only (debug builds).
class nnil_evalstack
{
public:
    nnil_evalstack() noexcept
        : top(0)
    {
    }

    // Push one value; asserts the stack is not full.
    void push(float value)
    {
        assert(top < _stack.size());
        _stack[top++] = value;
    }

    // Pop and return the top value; asserts the stack is not empty.
    float pop()
    {
        assert(top > 0);
        return _stack[--top];
    }

    // Duplicate the top value; asserts the stack is non-empty AND not full.
    void dup()
    {
        assert(top > 0);
        // Fix: the overflow check was missing, so dup() on a full stack wrote
        // one element past the end of _stack.
        assert(top < _stack.size());
        _stack[top] = _stack[top - 1];
        top++;
    }

private:
    std::array<float, 64> _stack;
    size_t top; // index of the first free slot; stack depth
};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,373 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <functional>
#include <mpark/variant.hpp>
#include <system_error>
#include <type_traits>
namespace nncase
{
// Evaluate x (a result); on error, propagate the error out of the enclosing
// function, discarding any Ok payload.
#define try_(x) \
    { \
        auto v = (x); \
        if (!v.is_ok()) \
            return nncase::err(std::move(v.unwrap_err())); \
    }
// Evaluate x; on success bind its payload to a NEW variable `name`,
// on error propagate the error out of the enclosing function.
#define try_var(name, x) \
    typename decltype((x))::traits::ok_type name; \
    { \
        auto v = (x); \
        if (v.is_ok()) \
            name = std::move(v.unwrap()); \
        else \
            return nncase::err(std::move(v.unwrap_err())); \
    }
// Like try_var, but for void-returning functions: on error, store the error
// into `e` (an out result) and return.
#define try_var_err(name, x, e) \
    typename decltype((x))::traits::ok_type name; \
    { \
        auto v = (x); \
        if (v.is_ok()) \
        { \
            name = std::move(v.unwrap()); \
        } \
        else \
        { \
            e = nncase::err(std::move(v.unwrap_err())); \
            return; \
        } \
    }
// Evaluate x; on success assign its payload to an EXISTING lvalue `name`,
// on error propagate the error out of the enclosing function.
#define try_set(name, x) \
    { \
        auto v = (x); \
        if (v.is_ok()) \
            name = std::move(v.unwrap()); \
        else \
            return nncase::err(std::move(v.unwrap_err())); \
    }
// Success payload wrapper: what result<T> stores on the Ok path.
template <class T>
struct Ok
{
    constexpr Ok(T &&value)
        : value(std::move(value)) { }
    constexpr Ok(const T &value)
        : value(value) { }

    // In-place construction of the payload from args.
    template <class... Args>
    constexpr explicit Ok(mpark::in_place_t, Args &&...args)
        : value(std::forward<Args>(args)...) { }

    T value;
};
// Empty success payload for result<void>.
template <>
struct Ok<void>
{
};
// Failure payload: wraps a std::error_condition.
struct Err
{
    // Implicit construction from any error-condition enum.
    template <class ErrCode, class = std::enable_if_t<std::is_error_condition_enum<ErrCode>::value>>
    Err(ErrCode value)
        : err(value) { }
    Err(std::error_condition err)
        : err(std::move(err)) { }

    std::error_condition err;
};
// Success with no payload (for result<void>).
inline constexpr Ok<void> ok()
{
    return {};
}
// Success constructing T in place from args.
template <class T, class... Args>
constexpr Ok<T> ok(Args &&...args)
{
    return Ok<T>(mpark::in_place, std::forward<Args>(args)...);
}
// Success wrapping an existing value (type deduced and decayed).
template <class T>
constexpr Ok<std::decay_t<T>> ok(T &&value)
{
    return Ok<std::decay_t<T>>(std::forward<T>(value));
}
// Failure from an error_condition.
inline Err err(std::error_condition value) noexcept
{
    return Err(std::move(value));
}
// Failure from any error-condition enum.
template <class ErrCode, class = std::enable_if_t<std::is_error_condition_enum<ErrCode>::value>>
Err err(ErrCode value)
{
    return err(std::error_condition(value));
}
template <class T>
class NNCASE_NODISCARD result;
namespace detail
{
// Trait: true iff T is a result<...> specialization.
template <class T>
NNCASE_INLINE_VAR bool constexpr is_result_v = false;
template <class T>
NNCASE_INLINE_VAR bool constexpr is_result_v<result<T>> = true;
// Exposes the Ok payload type of a result; rejects nesting (result<result<T>>).
template <class T>
struct result_traits
{
    static_assert(!is_result_v<T>, "Cannot use nested result");
    using ok_type = T;
};
// Invokes the map() callback on a non-void Ok payload and re-wraps the return.
// Fix: was declared `class`, leaving operator() private and making
// map_traits<T, Func>::operator() ill-formed on instantiation; `struct`
// restores the intended public access.
template <class T, class U, class Func>
struct map_call_impl
{
    result<U> operator()(Func &&func, Ok<T> &value) noexcept
    {
        return ok(func(value.value));
    }
};
template <class T, class Func>
struct map_traits;

// Invokes a no-argument map() callback (for result<void>) and wraps its return.
// Fix: was declared `class`, leaving operator() private and making
// map_traits<void, Func> ill-formed on instantiation; `struct` restores the
// intended public access.
template <class U, class Func>
struct map_call_void_impl
{
    result<U> operator()(Func &&func) noexcept
    {
        return ok(func());
    }
};

// map() adapter for result<void>: the callback takes no arguments.
template <class Func>
struct map_traits<void, Func>
{
    using U = invoke_result_t<Func>;
    static_assert(!is_result_v<U>, "Cannot map a callback returning result, use and_then instead");
    result<U> operator()(Func &&func, NNCASE_UNUSED Ok<void> &value) noexcept
    {
        return map_call_void_impl<U, Func>()(std::forward<Func>(func));
    }
};
template <class T, class Func>
struct map_err_traits;
// Traits for result<T>::and_then: the callback receives the Ok payload and
// must itself return a result (enforced by the static_assert).
template <class T, class Func>
struct and_then_traits
{
    using result_t = invoke_result_t<Func, T>;
    using traits_t = typename result_t::traits;
    using U = typename traits_t::ok_type;
    static_assert(is_result_v<result_t>, "Cannot then a callback not returning result, use map instead");
    result_t operator()(Func &&func, Ok<T> &value) noexcept
    {
        return func(value.value);
    }
};
// and_then for result<void>: the callback takes no arguments.
template <class Func>
struct and_then_traits<void, Func>
{
    using result_t = invoke_result_t<Func>;
    using traits_t = typename result_t::traits;
    using U = typename traits_t::ok_type;
    static_assert(is_result_v<result_t>, "Cannot then a callback not returning result, use map instead");
    result_t operator()(Func &&func, NNCASE_UNUSED Ok<void> &value) noexcept
    {
        return func();
    }
};
// Extracts the payload from an Ok<T> wrapper (lvalue and rvalue forms).
template <class T>
struct unwrap_impl
{
    T &operator()(Ok<T> &value) noexcept
    {
        return value.value;
    }
    T &&operator()(Ok<T> &&value) noexcept
    {
        return std::move(value.value);
    }
};
// result<void> has no payload: unwrapping is a no-op.
template <>
struct unwrap_impl<void>
{
    void operator()(NNCASE_UNUSED Ok<void> &value) noexcept
    {
    }
    void operator()(NNCASE_UNUSED Ok<void> &&value) noexcept
    {
    }
};
}
// Rust-style result type: holds either Ok<T> (variant index 0) or Err
// (index 1). Marked nodiscard so failures cannot be silently dropped.
template <class T>
class NNCASE_NODISCARD result
{
public:
    using traits = detail::result_traits<T>;

    // Implicit conversions from the ok()/err() factory wrappers.
    constexpr result(Ok<T> value)
        : ok_or_err_(std::move(value)) { }
    result(Err err)
        : ok_or_err_(std::move(err)) { }

    constexpr bool is_ok() const noexcept { return ok_or_err_.index() == 0; }
    constexpr bool is_err() const noexcept { return ok_or_err_.index() == 1; }

    // Returns the payload; terminates the process when holding an error.
    constexpr decltype(auto) unwrap() noexcept
    {
        if (is_ok())
            return detail::unwrap_impl<T>()(value());
        else
            std::terminate();
    }

    // Like unwrap(), but throws std::runtime_error carrying the error's
    // message instead of terminating.
    constexpr decltype(auto) unwrap_or_throw() &
    {
        if (is_ok())
            return detail::unwrap_impl<T>()(value());
        else
            throw std::runtime_error(unwrap_err().message());
    }

    constexpr decltype(auto) unwrap_or_throw() &&
    {
        if (is_ok())
            return detail::unwrap_impl<T>()(std::move(value()));
        else
            throw std::runtime_error(unwrap_err().message());
    }

    // Returns the error; terminates when holding a value.
    constexpr std::error_condition &unwrap_err() noexcept
    {
        if (is_ok())
            std::terminate();
        else
            return err().err;
    }

    // Same as unwrap(); `message` is currently unused (never printed).
    constexpr auto expect(NNCASE_UNUSED gsl::cstring_span message) noexcept
    {
        if (is_ok())
            return detail::unwrap_impl<T>()(value());
        else
            std::terminate();
    }

    // Transform the Ok payload with func (func must NOT return a result;
    // use and_then for that). Errors pass through unchanged.
    template <class Func, class Traits = detail::map_traits<T, Func>>
    constexpr typename Traits::result_t map(Func &&func) noexcept
    {
        if (is_ok())
            return Traits()(std::forward<Func>(func), value());
        else
            return err();
    }

    // Transform the error with func; the Ok path passes through unchanged.
    template <class Func, class Traits = detail::map_err_traits<T, Func>>
    constexpr typename Traits::result_t map_err(Func &&func) noexcept
    {
        if (is_ok())
            return value();
        else
            return Traits()(std::forward<Func>(func), err());
    }

    // Chain a callback that itself returns a result.
    template <class Func, class Traits = detail::and_then_traits<T, Func>>
    constexpr typename Traits::result_t and_then(Func &&func) noexcept
    {
        if (is_ok())
            return Traits()(std::forward<Func>(func), value());
        else
            return err();
    }

private:
    // Accessors for the active alternative; mpark::get throws
    // bad_variant_access if the other alternative is active.
    constexpr Ok<T> &&value() &&noexcept { return mpark::get<Ok<T>>(ok_or_err_); }
    constexpr Ok<T> &value() &noexcept { return mpark::get<Ok<T>>(ok_or_err_); }
    constexpr Err &err() noexcept { return mpark::get<Err>(ok_or_err_); }

private:
    mpark::variant<Ok<T>, Err> ok_or_err_;
};
namespace detail
{
// Traits for result<T>::map (non-void T): the callback receives the payload
// and must NOT itself return a result (use and_then for that).
template <class T, class Func>
struct map_traits
{
    using U = invoke_result_t<Func, T>;
    static_assert(!is_result_v<U>, "Cannot map a callback returning result, use and_then instead");
    using result_t = result<U>;
    result<U> operator()(Func &&func, Ok<T> &value) noexcept
    {
        return map_call_impl<T, U, Func>()(std::forward<Func>(func), value);
    }
};
template <class T, class Func>
struct map_err_traits
{
using U = invoke_result_t<Func, Err>;
static_assert(!is_result_v<U>, "Cannot map a callback returning result, use and_then instead");
result<U> operator()(Func &&func, Err &value) noexcept
{
return err(func(value.err));
}
};
// map() callback returning void, applied to a non-void Ok payload.
// Fix: was declared `class`, leaving operator() private and inaccessible from
// map_traits; `struct` restores the intended public access.
template <class T, class Func>
struct map_call_impl<T, void, Func>
{
    result<void> operator()(Func &&func, Ok<T> &value) noexcept
    {
        func(value.value);
        return ok();
    }
};
// map() callback returning void on a result<void>.
// Fix: `struct` (was `class`) -- same private-access defect as above.
template <class Func>
struct map_call_void_impl<void, Func>
{
    result<void> operator()(Func &&func) noexcept
    {
        func();
        return ok();
    }
};
}
}

View File

@ -0,0 +1,36 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include <nncase/runtime/error.h>
#include <nncase/runtime/model.h>
#include <nncase/runtime/result.h>
#include <nncase/runtime/runtime_module.h>
BEGIN_NS_NNCASE_RUNTIME
// Factory signature each runtime-module implementation exports: fills
// `result` with a newly created runtime_module, or with an error.
typedef void (*rt_module_activator_t)(result<std::unique_ptr<runtime_module>> &result);
// Well-known symbol name of the activator looked up when loading a module.
#define RUNTIME_MODULE_ACTIVATOR_NAME create_runtime_module
// Maps a module type id to its activator (for statically linked runtimes).
struct runtime_registration
{
    module_type_t id;
    rt_module_activator_t activator;
};
// Registration table of built-in runtimes; defined out of line.
extern runtime_registration builtin_runtimes[];
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,94 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "model.h"
#include "result.h"
#include "runtime_tensor.h"
BEGIN_NS_NNCASE_RUNTIME
class interpreter;
// Services a runtime module uses while deserializing itself from the model.
struct NNCASE_API runtime_module_init_context
{
    // True when the module's sections stay resident in the model buffer.
    virtual bool is_section_pinned() const noexcept = 0;
    virtual interpreter &interp() noexcept = 0;
    virtual const module_header &header() noexcept = 0;
    // Body of the named section (behavior for a missing name is defined by
    // the implementation -- confirm).
    virtual gsl::span<const gsl::byte> section(const char *name) noexcept = 0;
};
// Base class of all runtime module implementations. Owns per-module mempool
// descriptors and input/output tensor bindings; subclasses implement the
// *_core hooks and tensor allocation/validation.
class NNCASE_API runtime_module
{
private:
    // Book-keeping for one input/output: declared shape/strides/range plus the
    // user-bound, staging and device-side tensors.
    struct inout_tensor_info
    {
        runtime_shape_t shape;
        runtime_shape_t strides;
        memory_range range;
        runtime_tensor bind_tensor;
        runtime_tensor staging_tensor;
        runtime_tensor device_tensor;
    };

public:
    // Instantiate the registered module implementation for `type`.
    static result<std::unique_ptr<runtime_module>> create(const module_type_t &type);

    runtime_module() = default;
    runtime_module(runtime_module &) = delete;
    virtual ~runtime_module() = default;

    // Deserialize this module from its header; stores `interp`.
    result<void> initialize(const module_header &header, interpreter &interp) noexcept;
    // Second init phase, run after all modules exist (cross-module wiring).
    virtual result<void> initialize_inter_modules(interpreter &interp) noexcept;
    const module_type_t &type() const noexcept;

    interpreter &interp() const noexcept { return *interp_; }

    // Memory pool requirements declared by this module.
    uint32_t mempools_size() const noexcept;
    const mempool_desc &mempool(size_t index) const noexcept;
    mempool_desc mempool(memory_location_t location) const noexcept;

    // Input accessors: declared metadata, and get/bind the user tensor.
    uint32_t inputs_size() const noexcept;
    const runtime_shape_t &input_shape(size_t index) const noexcept;
    const memory_range &input_desc(size_t index) const noexcept;
    result<runtime_tensor> input_tensor(size_t index) noexcept;
    result<void> input_tensor(size_t index, runtime_tensor tensor) noexcept;

    // Output accessors, symmetric with the input ones.
    uint32_t outputs_size() const noexcept;
    const runtime_shape_t &output_shape(size_t index) const noexcept;
    const memory_range &output_desc(size_t index) const noexcept;
    result<runtime_tensor> output_tensor(size_t index) noexcept;
    result<void> output_tensor(size_t index, runtime_tensor tensor) noexcept;

    // Execute the module (wraps run_core()).
    result<void> run() noexcept;

protected:
    virtual result<void> initialize_core(runtime_module_init_context &context) noexcept = 0;
    virtual result<runtime_tensor> allocate_input_tensor(size_t index) noexcept = 0;
    virtual result<runtime_tensor> allocate_output_tensor(size_t index) noexcept = 0;
    virtual result<void> validate_input_tensor(size_t index, runtime_tensor tensor) noexcept = 0;
    virtual result<void> validate_output_tensor(size_t index, runtime_tensor tensor) noexcept = 0;
    result<runtime_tensor> device_input_tensor(size_t index) noexcept;
    result<runtime_tensor> device_output_tensor(size_t index) noexcept;
    virtual result<void> run_core() noexcept = 0;

private:
    module_header header_;
    std::vector<mempool_desc> mempools_;
    std::vector<inout_tensor_info> input_tensors_;
    std::vector<inout_tensor_info> output_tensors_;
    interpreter *interp_ = nullptr;
};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,259 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "datatypes.h"
#include "result.h"
BEGIN_NS_NNCASE_RUNTIME
// Size in bytes of one element of `type`.
inline constexpr size_t get_bytes(datatype_t type)
{
    return nncase::detail::datatype_bytes(type);
}
// Number of elements in a shape: product of all dimensions (empty shape -> 1).
inline size_t compute_size(const runtime_shape_t &shape)
{
    // Fix: seed the accumulation with size_t, not int. std::accumulate uses
    // the init argument's type for the running product, so the original `1`
    // computed (and could overflow/truncate) the product in int.
    return std::accumulate(shape.begin(), shape.end(), size_t { 1 }, std::multiplies<size_t>());
}
// Total bytes of a densely packed tensor of `type` with `shape`.
inline size_t get_bytes(datatype_t type, const runtime_shape_t &shape)
{
    return compute_size(shape) * get_bytes(type);
}
// Elements needed to back a strided view: extent * stride of the dimension
// with the largest stride, skipping broadcast dims (shape[i] == 1); at least 1.
inline size_t compute_size(const runtime_shape_t &shape, const runtime_shape_t &strides)
{
    size_t max_stride = 0, max_shape = 0;
    for (size_t i = 0; i < shape.size(); i++)
    {
        // A broadcast dim contributes stride 0 and so can never dominate.
        if ((shape[i] == 1 ? 0 : strides[i]) > max_stride)
        {
            max_stride = strides[i];
            max_shape = shape[i];
        }
    }
    size_t size = max_stride * max_shape;
    return size ? size : 1;
}
// Bytes needed to back a strided view of `type`.
inline size_t get_bytes(datatype_t type, const runtime_shape_t &shape, const runtime_shape_t &strides)
{
    return compute_size(shape, strides) * get_bytes(type);
}
namespace detail
{
// With the nullptr tag: zero the stride of broadcast (extent-1) dimensions.
template <class shape_type, class strides_type>
inline void adapt_strides(const shape_type &shape, strides_type &strides,
    std::nullptr_t, typename strides_type::size_type i) noexcept
{
    if (shape[i] == 1)
    {
        strides[i] = 0;
    }
}
// Fill `strides` with row-major strides for `shape` (innermost stride 1),
// applying adapt_strides per dimension; returns the total element count.
template <class shape_type, class strides_type, class bs_ptr>
inline std::size_t compute_strides(const shape_type &shape,
    strides_type &strides, bs_ptr bs)
{
    using strides_value_type = typename std::decay_t<strides_type>::value_type;
    strides_value_type data_size = 1;
    for (std::size_t i = shape.size(); i != 0; --i)
    {
        strides[i - 1] = data_size;
        data_size = strides[i - 1] * static_cast<strides_value_type>(shape[i - 1]);
        adapt_strides(shape, strides, bs, i - 1);
    }
    return static_cast<std::size_t>(data_size);
}
}
// Row-major strides with broadcast dims zeroed; returns the element count.
template <class shape_type, class strides_type>
inline std::size_t compute_strides(const shape_type &shape, strides_type &strides)
{
    return detail::compute_strides(shape, strides, nullptr);
}
// Default (row-major, broadcast-aware) strides for `shape`.
inline runtime_shape_t get_default_strides(const runtime_shape_t &shape)
{
    runtime_shape_t strides(shape.size());
    compute_strides(shape, strides);
    return strides;
}
// Reinterpret a shape in a different element type by rescaling the innermost
// dimension: last' = last * sizeof(src) / sizeof(dest). Empty shapes pass
// through unchanged.
template <class TShape>
TShape convert_shape_type(const TShape &shape, datatype_t src, datatype_t dest)
{
    TShape converted = shape;
    if (converted.empty())
        return converted;

    const auto src_bytes = get_bytes(src);
    const auto dest_bytes = get_bytes(dest);
    converted.back() = converted.back() * src_bytes / dest_bytes;
    return converted;
}
// Reinterpret strides in a different element type. Inner (non-last) strides
// are rescaled by sizeof(src)/sizeof(dest); the innermost stride must already
// be 0 (broadcast) or 1 (contiguous), otherwise not_supported is returned.
template <class TShape>
result<TShape> convert_strides_type(const TShape &strides, datatype_t src, datatype_t dest)
{
    const auto src_size = get_bytes(src);
    const auto dest_size = get_bytes(dest);
    if (src_size == dest_size)
        return ok(strides);

    TShape new_strides = strides;
    // 1. Except last dim. Fix: the bound was `new_strides.size() - 1`, which
    // wraps around for empty strides; `i + 1 < size()` is wrap-safe.
    for (size_t i = 0; i + 1 < new_strides.size(); i++)
    {
        auto &v = new_strides[i];
        if (v == 0)
            v = 1;
        v = v * src_size / dest_size;
    }

    // 2. Last dim
    if (!new_strides.empty())
    {
        // 2.1. If last dim is not 0 or 1, unsupported.
        auto last_dim = new_strides.back();
        // Fix: was `last_dim != 0 || last_dim != 1`, which is true for every
        // value, so every conversion reaching this point was rejected.
        if (last_dim != 0 && last_dim != 1)
            return err(std::errc::not_supported);
    }
    return ok(new_strides);
}
// Count the zero bits above the highest set bit within the low `Bits` bits of
// value; returns Bits when no bit in that range is set.
template <int32_t Bits, class T>
uint8_t count_leading_zeros(T value)
{
    uint8_t zeros = 0;
    int32_t bit = Bits - 1;
    while (bit >= 0 && (value & (1ULL << bit)) == 0)
    {
        ++zeros;
        --bit;
    }
    return zeros;
}
// Mask with the low `shift` bits set, e.g. bit_mask(3) == 0b111.
template <class T = uint64_t>
inline T bit_mask(uint8_t shift)
{
    return (T(1) << shift) - 1;
}
// Arithmetic shift with rounding.
//   shift > 0 : value >> shift, rounded -- Banker selects round-half-to-even,
//               otherwise round-half-up (adds 1 << (shift - 1) before shifting).
//   shift < 0 : value << -shift (no rounding).
//   shift == 0: value unchanged.
// NOTE(review): in the Banker path `sign` moves half-way values AWAY from the
// integral part for negative inputs, while the fraction bits come from
// two's-complement floor semantics -- confirm the intended rounding of
// negative inputs.
template <class T, bool Banker = false>
T carry_shift(T value, int32_t shift)
{
    if (shift > 0)
    {
        if (Banker)
        {
            // Sign | Int (T - shift - 1 bits) | Frac (shift bits)
            // S IIII FFF
            auto integral = value >> shift;
            auto fractional = value & bit_mask(shift);
            auto sign = value < 0 ? -1 : 1;
            auto half = size_t(1) << (shift - 1);

            // frac < 0.5
            if (fractional < half)
            {
                return integral;
            }
            // frac > 0.5
            else if (fractional > half)
            {
                return integral + sign;
            }
            // frac == 0.5: round to even
            else
            {
                // odd
                if (integral & 1)
                    return integral + sign;
                // even
                else
                    return integral;
            }
            // Fix: removed an uninitialized `T result;` declared above and an
            // unreachable `return result;` here -- dead code that would have
            // been UB had it ever executed.
        }
        else
        {
            value += T(1) << (shift - 1);
            value >>= shift;
        }
    }
    else if (shift < 0)
    {
        value = value << (-shift);
    }
    return value;
}
// Multiply in 64 bits, then apply carry_shift to the product and truncate the
// rounded result back to 32 bits.
template <bool Banker = false>
inline int32_t mul_and_carry_shift(int32_t value, int32_t mul, int32_t shift)
{
    const int64_t product = (int64_t)value * mul;
    return (int32_t)carry_shift<int64_t, Banker>(product, shift);
}
// Clamp value into [min, max]. Equivalent to std::min(max, std::max(value,
// min)): when min > max, the result is max.
template <class T>
inline T clamp(T value, T min, T max)
{
    if (value < min)
        value = min;
    if (max < value)
        value = max;
    return value;
}
// Clamp to the value range of a signed `Bits`-bit integer.
// NOTE(review): relies on arithmetic right shift of a negative value for the
// lower bound (implementation-defined before C++20) -- fine on the targeted
// GCC/RISC-V toolchain, but confirm if ported.
template <uint8_t Bits>
inline int32_t clamp(int32_t value)
{
    auto min = std::numeric_limits<int32_t>::lowest() >> (32 - Bits);
    auto max = std::numeric_limits<int32_t>::max() >> (32 - Bits);
    return clamp(value, min, max);
}
// True iff `strides` are exactly the default (contiguous row-major) strides
// for `shape`.
template <class TShape>
inline bool is_contiguous(const TShape &shape, const TShape &strides)
{
    return get_default_strides(shape) == strides;
}
// Scanning from the innermost dimension outwards, returns i + 1 for the first
// index i whose stride differs from the default; -1 when every stride matches.
inline int get_last_not_contiguous_index(const runtime_shape_t &strides, const runtime_shape_t &default_strides)
{
    for (int i = strides.size() - 1; i >= 0; --i)
    {
        if (strides[i] != default_strides[i])
        {
            return i + 1;
        }
    }
    return -1;
}
// Compile-time size_t inequality as an integral_constant value.
template<size_t A, size_t B>
constexpr auto is_not_equal = std::integral_constant<bool, std::not_equal_to<size_t> {}(A, B)> {};
// Empty tag type; presumably a default for callable template parameters --
// confirm at use sites.
struct DefaultCallable {};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,148 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "model.h"
#include "result.h"
#include <functional>
#include <memory>
BEGIN_NS_NNCASE_RUNTIME
// Identity tag for a tensor backend (e.g. host). Instances are non-copyable
// singletons compared by ADDRESS, not by the `data` string.
struct runtime_tensor_type
{
    const char *data;

    explicit runtime_tensor_type(const char *data) noexcept
        : data(data)
    {
    }

    runtime_tensor_type(runtime_tensor_type &) = delete;
    runtime_tensor_type &operator=(runtime_tensor_type &) = delete;
};

// Identity comparison: two types are equal only when they are the same object.
inline bool operator==(runtime_tensor_type &lhs, runtime_tensor_type &rhs) noexcept
{
    runtime_tensor_type *l = &lhs;
    runtime_tensor_type *r = &rhs;
    return l == r;
}

inline bool operator!=(runtime_tensor_type &lhs, runtime_tensor_type &rhs) noexcept
{
    return !(&lhs == &rhs);
}
namespace detail
{
class runtime_tensor_impl;
class host_runtime_tensor_impl;
}
// Handle to a tensor owned by some runtime backend; shared-ownership pimpl,
// cheap to copy. A handle with no impl is empty().
class NNCASE_API runtime_tensor
{
public:
    runtime_tensor() noexcept;
    runtime_tensor(std::shared_ptr<detail::runtime_tensor_impl> impl) noexcept;

    datatype_t datatype() const noexcept;
    const runtime_shape_t &shape() const noexcept;
    const runtime_shape_t &strides() const noexcept;
    // Backend identity tag (compared by address; see runtime_tensor_type).
    runtime_tensor_type &tensor_type() const noexcept;
    bool empty() const noexcept;
    bool is_host() const noexcept;
    bool is_contiguous() const noexcept;

    detail::runtime_tensor_impl *impl() noexcept { return impl_.get(); }
    const detail::runtime_tensor_impl *impl() const noexcept { return impl_.get(); }

    bool can_copy_to_without_staging(const runtime_tensor &dest) const noexcept;
    result<void> copy_to(runtime_tensor &dest) noexcept;
    result<runtime_tensor> as_host() noexcept;
    // Drops the reference to the implementation.
    void reset() noexcept;

private:
    std::shared_ptr<detail::runtime_tensor_impl> impl_;
};
// Tensor handle equality (defined out of line).
NNCASE_API bool operator==(const runtime_tensor &lhs, const runtime_tensor &rhs) noexcept;
NNCASE_API bool operator!=(const runtime_tensor &lhs, const runtime_tensor &rhs) noexcept;
// Factory and map/sync API for tensors backed by host memory.
namespace host_runtime_tensor
{
// Which pool backs the allocation (shared presumably allows device access via
// a physical address -- confirm platform implementation).
typedef enum memory_pool_
{
    pool_cpu_only,
    pool_shared
} memory_pool_t;
// Cache maintenance direction for sync().
typedef enum sync_op_
{
    sync_invalidate,
    sync_write_back
} sync_op_t;
// Access mode requested when mapping a tensor's buffer.
typedef enum map_access_
{
    map_none = 0,
    map_read = 1,
    map_write = 2,
    map_read_write = 3
} map_access_t;
DEFINE_ENUM_BITMASK_OPERATORS(map_access_t)
// Move-only view of a mapped tensor buffer; offers unmap() explicitly and has
// a destructor (which presumably unmaps -- confirm implementation).
class NNCASE_API mapped_buffer
{
public:
    mapped_buffer() noexcept;
    mapped_buffer(detail::host_runtime_tensor_impl &impl, map_access_t access, uintptr_t address, size_t size_bytes) noexcept;
    mapped_buffer(mapped_buffer &&other) noexcept;
    mapped_buffer(const mapped_buffer &) = delete;
    ~mapped_buffer();
    mapped_buffer &operator=(mapped_buffer &&) noexcept;
    mapped_buffer &operator=(const mapped_buffer &) = delete;

    result<void> unmap() noexcept;

    // The mapped bytes.
    gsl::span<gsl::byte> buffer() const noexcept
    {
        return { reinterpret_cast<gsl::byte *>(address_), size_bytes_ };
    }

private:
    detail::host_runtime_tensor_impl *impl_;
    map_access_t access_;
    uintptr_t address_;
    size_t size_bytes_;
};
// Custom deleter invoked when a non-copying, caller-owned buffer is released.
typedef std::function<void(gsl::byte *)> data_deleter_t;
// Identity tag of the host tensor backend.
NNCASE_API runtime_tensor_type &tensor_type() noexcept;
// Factory overloads: allocate, wrap (copy or borrow) an existing buffer,
// optionally with explicit strides, pool and physical address.
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, gsl::span<gsl::byte> data, bool copy, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, gsl::span<gsl::byte> data, data_deleter_t data_deleter, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span<gsl::byte> data, bool copy, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span<gsl::byte> data, data_deleter_t data_deleter, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
NNCASE_API result<memory_pool_t> memory_pool(const runtime_tensor &tensor) noexcept;
NNCASE_API result<mapped_buffer> map(runtime_tensor &tensor, map_access_t access) noexcept;
NNCASE_API result<void> sync(runtime_tensor &tensor, sync_op_t op, bool force = false) noexcept;
}
// Short alias used throughout the runtime.
namespace hrt = host_runtime_tensor;
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,49 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_tensor.h"
BEGIN_NS_NNCASE_RUNTIME
namespace detail
{
// Backend interface behind runtime_tensor: concrete implementations provide
// metadata and the cross-backend copy hooks below.
class NNCASE_API runtime_tensor_impl
{
public:
    virtual ~runtime_tensor_impl() = default;

    virtual datatype_t datatype() const noexcept = 0;
    virtual const runtime_shape_t &shape() const noexcept = 0;
    virtual const runtime_shape_t &strides() const noexcept = 0;
    virtual runtime_tensor_type &tensor_type() const noexcept = 0;

    bool is_host() const noexcept;
    bool is_contiguous() const noexcept;

    bool can_copy_to_without_staging(const runtime_tensor &dest) const noexcept;
    result<void> copy_to(runtime_tensor &dest) noexcept;
    result<runtime_tensor> copy_as_host() noexcept;

    // Copy hooks dispatched by copy_to(); default behavior defined out of line.
    virtual bool can_copy_from_different_type(const runtime_tensor_impl &src) const noexcept;
    virtual bool can_copy_to_different_type(const runtime_tensor_impl &dest) const noexcept;
    virtual result<void> copy_to_same_type(runtime_tensor_impl &dest) noexcept;
    virtual result<void> copy_from_different_type(runtime_tensor_impl &src) noexcept;
    virtual result<void> copy_to_different_type(runtime_tensor_impl &dest) noexcept;
    virtual result<void> copy_from_host(runtime_tensor_impl &src) noexcept;
    virtual result<void> copy_to_host(runtime_tensor_impl &dest) noexcept;
};
}
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,31 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_tensor_impl.h"
BEGIN_NS_NNCASE_RUNTIME
namespace detail
{
class host_runtime_tensor_impl;
}
END_NS_NNCASE_RUNTIME
// Platform hook: a port can substitute its own shared-tensor implementation
// header by defining NNCASE_SHARED_RUNTIME_TENSOR_PLATFORM_HEADER.
#ifndef NNCASE_SHARED_RUNTIME_TENSOR_PLATFORM_HEADER
#include "shared_runtime_tensor.platform.h"
#else
#include NNCASE_SHARED_RUNTIME_TENSOR_PLATFORM_HEADER
#endif

View File

@ -0,0 +1,44 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "runtime_tensor_impl.h"
BEGIN_NS_NNCASE_RUNTIME
namespace detail
{
struct host_memory_block;
// Physically addressed backing of a host memory block (for device-shared
// tensors). Move-only; `owned` marks whether free() should release the
// allocation (destructor behavior defined by the platform impl -- confirm).
struct NNCASE_API physical_memory_block
{
    uintptr_t physical_address;
    bool owned;

    physical_memory_block() noexcept;
    ~physical_memory_block();
    physical_memory_block(const physical_memory_block &) = delete;
    physical_memory_block(physical_memory_block &&other) noexcept;
    physical_memory_block &operator=(const physical_memory_block &) = delete;
    physical_memory_block &operator=(physical_memory_block &&other) noexcept;

    result<void> free() noexcept;

    // Platform hooks tying a host block to physical memory.
    static result<void> acknowledge(host_memory_block &block) noexcept;
    static result<void> allocate(host_memory_block &block) noexcept;
    static result<void> sync(host_memory_block &block, host_runtime_tensor::sync_op_t op) noexcept;
};
}
END_NS_NNCASE_RUNTIME

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,142 @@
/* Copyright 2019-2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "compiler_defs.h"
#include <cstring>
#include <gsl/gsl-lite.hpp>
BEGIN_NS_NNCASE_RUNTIME
// Forward-only cursor over a read-only byte span. No explicit bounds checks:
// callers must ensure enough bytes remain (overrunning subspan is a gsl
// precondition violation).
class span_reader
{
public:
    span_reader(gsl::span<const gsl::byte> span)
        : span_(span)
    {
    }

    // True when no bytes remain.
    bool empty() const noexcept { return span_.empty(); }
    // Remaining byte count.
    size_t avail() const noexcept { return span_.size_bytes(); }

    // Read one T by direct dereference and advance. NOTE(review): requires
    // the cursor to be suitably aligned for T; use read_unaligned otherwise.
    template <class T>
    T read()
    {
        auto value = *reinterpret_cast<const T *>(span_.data());
        advance(sizeof(T));
        return value;
    }

    // Read one T via memcpy (no alignment requirement) and advance.
    template <class T>
    T read_unaligned()
    {
        alignas(T) uint8_t storage[sizeof(T)];
        std::memcpy(storage, span_.data(), sizeof(T));
        advance(sizeof(T));
        return *reinterpret_cast<const T *>(storage);
    }

    // Read one T into an out parameter and advance (aligned access).
    template <class T>
    void read(T &value)
    {
        value = *reinterpret_cast<const T *>(span_.data());
        advance(sizeof(T));
    }

    // View the next `size` Ts in place (no copy) and advance past them.
    template <class T>
    void read_span(gsl::span<const T> &span, size_t size)
    {
        span = { reinterpret_cast<const T *>(span_.data()), size };
        advance(sizeof(T) * size);
    }

    template <class T = gsl::byte>
    gsl::span<const T> read_span(size_t size)
    {
        gsl::span<const T> span(reinterpret_cast<const T *>(span_.data()), size);
        advance(sizeof(T) * size);
        return span;
    }

    // Consume and return everything that remains.
    void read_avail(gsl::span<const gsl::byte> &span)
    {
        span = span_;
        span_ = {};
    }

    gsl::span<const gsl::byte> read_avail()
    {
        auto span = span_;
        span_ = {};
        return span;
    }

    // Look at the remaining bytes without consuming them.
    gsl::span<const gsl::byte> peek_avail()
    {
        return span_;
    }

    // Read one T without advancing (aligned access).
    template <class T>
    T peek()
    {
        auto value = *reinterpret_cast<const T *>(span_.data());
        return value;
    }

    // Read one T without advancing (unaligned-safe).
    template <class T>
    T peek_unaligned()
    {
        T value;
        std::memcpy(&value, span_.data(), sizeof(T));
        return value;
    }

    // Read one T at `offset` bytes ahead without advancing (unaligned-safe).
    template <class T>
    T peek_unaligned_with_offset(size_t offset)
    {
        T value;
        std::memcpy(&value, span_.data() + offset, sizeof(T));
        return value;
    }

    // Return a pointer to the next T in place and advance past it.
    template <class T>
    const T *get_ref()
    {
        auto ptr = reinterpret_cast<const T *>(span_.data());
        advance(sizeof(T));
        return ptr;
    }

    template <class T>
    void get_ref(const T *&ptr)
    {
        ptr = get_ref<T>();
    }

    // Skip `count` bytes.
    void skip(size_t count)
    {
        advance(count);
    }

private:
    void advance(size_t count)
    {
        span_ = span_.subspan(count);
    }

private:
    gsl::span<const gsl::byte> span_;
};
END_NS_NNCASE_RUNTIME

View File

@ -0,0 +1,25 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nncase/kernels/kernel_context.h>
BEGIN_NS_NNCASE_RT_STACKVM
// Kernel-execution context specific to the stack VM runtime module.
struct NNCASE_API stackvm_kernel_context : public kernels::kernel_context
{
    // NOTE(review): presumably the number of worker threads available to
    // parallelized kernels; defaults to 4. Confirm against the stackvm
    // kernel implementations before relying on this.
    int num_threads_ = 4;
};
END_NS_NNCASE_RT_STACKVM

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "../runtime_module.h"
BEGIN_NS_NNCASE_RT_STACKVM
NNCASE_INLINE_VAR constexpr module_type_t stackvm_module_type = to_module_type("stackvm");
NNCASE_API result<std::unique_ptr<runtime_module>> create_stackvm_runtime_module();
END_NS_NNCASE_RT_STACKVM

View File

@ -0,0 +1,17 @@
/* Copyright 2020 Canaan Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// nncase runtime version string.
// NOTE(review): likely generated from a version template at build time —
// confirm before editing by hand.
#define NNCASE_VERSION "1.0.0"
// `git describe` of the source tree this runtime was built from
// ("-dirty" = uncommitted local changes were present).
#define NNCASE_GIT_DESC "15b0a90-dirty"

View File

@ -0,0 +1 @@
include(${CMAKE_CURRENT_LIST_DIR}/nncase_rt_modules_k210Targets.cmake)

Some files were not shown because too many files have changed in this diff Show More