Rollback 4f73860...
parent
0c5fa2471a
commit
15ca3baf45
|
@ -680,7 +680,6 @@ typedef struct
|
|||
const uint8_t *volatile current_body;
|
||||
dmac_channel_number_t dma_ch;
|
||||
kpu_done_callback_t done_callback;
|
||||
volatile uint8_t load_first;
|
||||
void *userdata;
|
||||
};
|
||||
|
||||
|
|
|
@ -1006,8 +1006,12 @@ static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_mod
|
|||
size_t count = arg->count;
|
||||
const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
||||
|
||||
kpu_model_quant_param_t q = arg->quant_param;
|
||||
|
||||
kpu_model_quant_param_t q;
|
||||
#if FIX_CACHE
|
||||
memcpy(&q, &arg->quant_param, sizeof(kpu_model_quant_param_t));
|
||||
#else
|
||||
q = arg->quant_param;
|
||||
#endif
|
||||
float scale = 1.f / q.scale;
|
||||
|
||||
uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
|
||||
|
@ -1028,8 +1032,12 @@ static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *a
|
|||
const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
||||
float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
||||
size_t oc, count = arg->count;
|
||||
kpu_model_quant_param_t q = arg->quant_param;
|
||||
|
||||
kpu_model_quant_param_t q;
|
||||
#if FIX_CACHE
|
||||
memcpy(&q, &arg->quant_param, sizeof(kpu_model_quant_param_t));
|
||||
#else
|
||||
q = arg->quant_param;
|
||||
#endif
|
||||
for(oc = 0; oc < count; oc++)
|
||||
dest[oc] = *src++ * q.scale + q.bias;
|
||||
}
|
||||
|
@ -1134,7 +1142,10 @@ static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_arg
|
|||
const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
||||
float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
||||
uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
|
||||
const float *weights = arg->weights, *bias = arg->weights + in_channels * out_channels;
|
||||
float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
|
||||
float *bias = (float *)malloc(out_channels * sizeof(float));
|
||||
memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
|
||||
memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
|
||||
|
||||
if(in_channels % 8 == 0)
|
||||
{
|
||||
|
@ -1181,7 +1192,8 @@ static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_arg
|
|||
dest[oc] = sum + bias[oc];
|
||||
}
|
||||
}
|
||||
|
||||
free(weights);
|
||||
free(bias);
|
||||
kpu_float_activation(dest, out_channels, arg->act);
|
||||
}
|
||||
|
||||
|
@ -1265,10 +1277,9 @@ static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_mod
|
|||
static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
|
||||
{
|
||||
volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
|
||||
|
||||
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - 0x40000000;
|
||||
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - 0x40000000;
|
||||
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - 0x40000000;
|
||||
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset);
|
||||
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset);
|
||||
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset);
|
||||
|
||||
if(arg->flags & KLF_MAIN_MEM_OUT)
|
||||
{
|
||||
|
@ -1360,14 +1371,14 @@ static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_c
|
|||
|
||||
int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
||||
{
|
||||
#if FIX_CACHE
|
||||
configASSERT(!is_memory_cache((uintptr_t)buffer));
|
||||
#endif
|
||||
uintptr_t base_addr = (uintptr_t)buffer;
|
||||
const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
|
||||
|
||||
configASSERT(is_memory_cache((uintptr_t)buffer))
|
||||
|
||||
if(header->version == 3 && header->arch == 0)
|
||||
{
|
||||
ctx->load_first = 1;
|
||||
ctx->is_nncase = 0;
|
||||
ctx->model_buffer = buffer;
|
||||
ctx->output_count = header->output_count;
|
||||
|
@ -1482,7 +1493,6 @@ static int kpu_kmodel_done(kpu_model_context_t *ctx)
|
|||
.calc_done_int = 1,
|
||||
.layer_cfg_almost_empty_int = 1,
|
||||
.layer_cfg_almost_full_int = 1};
|
||||
ctx->load_first = 0;
|
||||
#if KPU_DEBUG
|
||||
uint32_t cnt_layer_id = ctx->current_layer - 1;
|
||||
uint64_t time = sysctl_get_time_us();
|
||||
|
@ -1582,13 +1592,6 @@ static int ai_step(void *userdata)
|
|||
kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
|
||||
break;
|
||||
case KL_K210_CONV:
|
||||
if(ctx->load_first)
|
||||
{
|
||||
for(int i=0; i<cnt_layer_header->body_size; i++)
|
||||
{
|
||||
*((uint8_t *)((uintptr_t)layer_body-0x40000000)+i) = *((uint8_t *)layer_body+i);
|
||||
}
|
||||
}
|
||||
kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
|
||||
return 0;
|
||||
case KL_K210_ADD_PADDING:
|
||||
|
|
|
@ -75,6 +75,17 @@ namespace kernels
|
|||
const auto g_ic = in_shape[1] / groups;
|
||||
const auto g_oc = out_channels / groups;
|
||||
|
||||
#if FIX_CACHE
|
||||
float *cache_weights = new float[(size_t)out_channels * in_shape[1] / groups * filter_h * filter_w];
|
||||
float *cache_bias = new float[out_channels];
|
||||
|
||||
memcpy(cache_weights, weights, (size_t)out_channels * in_shape[1] / groups * filter_h * filter_w*sizeof(float));
|
||||
memcpy(cache_bias, bias, out_channels*sizeof(float));
|
||||
#else
|
||||
float *cache_weights = weights;
|
||||
float *cache_bias = bias;
|
||||
#endif
|
||||
|
||||
for (int32_t batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
@ -82,7 +93,7 @@ namespace kernels
|
|||
for (int32_t og = 0; og < groups; og++)
|
||||
{
|
||||
const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
|
||||
const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
|
||||
const float *w_group_p = cache_weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
|
||||
|
||||
for (int32_t oc = 0; oc < g_oc; oc++)
|
||||
{
|
||||
|
@ -98,7 +109,7 @@ namespace kernels
|
|||
const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
|
||||
const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
|
||||
const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
|
||||
float value = bias[og * g_oc + oc];
|
||||
float value = cache_bias[og * g_oc + oc];
|
||||
|
||||
for (int32_t ic = 0; ic < g_ic; ic++)
|
||||
{
|
||||
|
@ -126,6 +137,10 @@ namespace kernels
|
|||
}
|
||||
}
|
||||
}
|
||||
#if FIX_CACHE
|
||||
delete []cache_weights;
|
||||
delete []cache_bias;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class TQ>
|
||||
|
@ -141,11 +156,17 @@ namespace kernels
|
|||
|
||||
inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
|
||||
{
|
||||
#if FIX_CACHE
|
||||
float *cache_mem = new float[b_cols];
|
||||
memcpy(cache_mem, bias, b_cols*sizeof(float));
|
||||
#else
|
||||
const float *cache_mem =bias;
|
||||
#endif
|
||||
for (size_t oy = 0; oy < a_rows; oy++)
|
||||
{
|
||||
for (size_t ox = 0; ox < b_cols; ox++)
|
||||
{
|
||||
float value = bias[ox];
|
||||
float value = cache_mem[ox];
|
||||
|
||||
for (size_t i = 0; i < a_cols; i++)
|
||||
{
|
||||
|
@ -157,6 +178,9 @@ namespace kernels
|
|||
output[oy * b_cols + ox] = details::apply_activation(value, fused_activation);
|
||||
}
|
||||
}
|
||||
#if FIX_CACHE
|
||||
delete []cache_mem;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class T>
|
||||
|
|
|
@ -55,9 +55,6 @@ namespace runtime
|
|||
|
||||
void run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata);
|
||||
|
||||
public:
|
||||
uint8_t load_first_;
|
||||
|
||||
protected:
|
||||
virtual bool initialize();
|
||||
virtual xtl::span<uint8_t> memory_at(const memory_range &range) const noexcept;
|
||||
|
|
|
@ -61,9 +61,9 @@ namespace runtime
|
|||
reader.skip(layer.kernel_load_cfg.data.para_start_addr);
|
||||
reader.read_span(weights, weights_size);
|
||||
#if !NNCASE_TARGET_K210_SIMULATOR
|
||||
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data() - 0x40000000;
|
||||
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation - 0x40000000;
|
||||
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data() - 0x40000000;
|
||||
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data();
|
||||
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation;
|
||||
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -28,5 +28,5 @@ END_DEFINE_TARGET()
|
|||
// K210
|
||||
BEGINE_DEFINE_TARGET(k210)
|
||||
DEFINE_RUNTIME_OP(k210, kpu_upload, KPUUpload, 0x2001)
|
||||
DEFINE_RUNTIME_FENCE_OP(k210, kpu_conv2d, KPUConv2D, 0x2002)
|
||||
DEFINE_RUNTIME_OP(k210, kpu_conv2d, KPUConv2D, 0x2002)
|
||||
END_DEFINE_TARGET()
|
||||
|
|
|
@ -23,7 +23,6 @@ namespace runtime
|
|||
#define BEGINE_DEFINE_TARGET(...)
|
||||
#define DEFINE_NEUTRAL_RUNTIME_OP(id, name, value) rop_##id = value,
|
||||
#define DEFINE_RUNTIME_OP(target, id, name, value) rop_##target##_##id = value,
|
||||
#define DEFINE_RUNTIME_FENCE_OP(target, id, name, value) rop_##target##_##id = value,
|
||||
#define END_DEFINE_TARGET()
|
||||
|
||||
enum runtime_opcode : uint32_t
|
||||
|
@ -33,16 +32,12 @@ namespace runtime
|
|||
|
||||
#undef DEFINE_NEUTRAL_RUNTIME_OP
|
||||
#undef DEFINE_RUNTIME_OP
|
||||
#undef DEFINE_RUNTIME_FENCE_OP
|
||||
#define DEFINE_NEUTRAL_RUNTIME_OP(id, name, value) \
|
||||
case rop_##id: \
|
||||
return #name;
|
||||
#define DEFINE_RUNTIME_OP(target, id, name, value) \
|
||||
case rop_##target##_##id: \
|
||||
return #name;
|
||||
#define DEFINE_RUNTIME_FENCE_OP(target, id, name, value) \
|
||||
case rop_##target##_##id: \
|
||||
return #name;
|
||||
|
||||
constexpr std::string_view node_opcode_names(runtime_opcode opcode)
|
||||
{
|
||||
|
@ -57,7 +52,6 @@ namespace runtime
|
|||
#undef BEGINE_DEFINE_TARGET
|
||||
#undef DEFINE_NEUTRAL_RUNTIME_OP
|
||||
#undef DEFINE_RUNTIME_OP
|
||||
#undef DEFINE_RUNTIME_FENCE_OP
|
||||
#undef END_DEFINE_TARGET
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,9 +46,7 @@ bool interpreter_base::try_load_model(const uint8_t *buffer)
|
|||
node_headers_ = { reinterpret_cast<const node_header *>(offset), nodes_size() };
|
||||
offset += sizeof(node_header) * nodes_size();
|
||||
node_body_start_ = offset;
|
||||
#if !NNCASE_TARGET_K210_SIMULATOR
|
||||
load_first_ = 1;
|
||||
#endif
|
||||
|
||||
return initialize();
|
||||
}
|
||||
|
||||
|
@ -93,7 +91,6 @@ void interpreter_base::step()
|
|||
|
||||
if (cnt_node_ == nodes_size())
|
||||
{
|
||||
load_first_ = 0;
|
||||
run_callback_(userdata_);
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
#include <runtime/kernel_registry.h>
|
||||
#include <runtime/neutral/neutral_ops_body.h>
|
||||
#include <runtime/span_reader.h>
|
||||
#include <cstring>
|
||||
|
||||
using namespace nncase;
|
||||
using namespace nncase::runtime;
|
||||
|
@ -34,8 +33,6 @@ namespace runtime
|
|||
kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t);
|
||||
#define DEFINE_RUNTIME_OP(target, id, name, value) \
|
||||
kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t);
|
||||
#define DEFINE_RUNTIME_FENCE_OP(target, id, name, value) \
|
||||
kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t);
|
||||
|
||||
#define END_DEFINE_TARGET() }
|
||||
|
||||
|
@ -44,7 +41,6 @@ namespace runtime
|
|||
#undef BEGINE_DEFINE_TARGET
|
||||
#undef DEFINE_NEUTRAL_RUNTIME_OP
|
||||
#undef DEFINE_RUNTIME_OP
|
||||
#undef DEFINE_RUNTIME_FENCE_OP
|
||||
#undef END_DEFINE_TARGET
|
||||
}
|
||||
}
|
||||
|
@ -70,21 +66,6 @@ kernel_call_result runtime::call_kernel(runtime_opcode opcode, xtl::span<const u
|
|||
options.deserialize(reader); \
|
||||
return nncase::runtime::target::id(options, interpreter, step); \
|
||||
}
|
||||
#define DEFINE_RUNTIME_FENCE_OP(target, id, name, value) \
|
||||
case rop_##target##_##id: \
|
||||
{ \
|
||||
if(interpreter.load_first_) \
|
||||
{ \
|
||||
for(int i=0; i<body.size(); i++)\
|
||||
{\
|
||||
*((uint8_t *)((uintptr_t)body.data()-0x40000000)+i) = *((uint8_t *)body.data()+i);\
|
||||
}\
|
||||
} \
|
||||
nncase::runtime::target::id##_options options; \
|
||||
options.deserialize(reader); \
|
||||
return nncase::runtime::target::id(options, interpreter, step); \
|
||||
}
|
||||
|
||||
#define END_DEFINE_TARGET()
|
||||
|
||||
#include <runtime/runtime_op.def>
|
||||
|
@ -92,7 +73,6 @@ kernel_call_result runtime::call_kernel(runtime_opcode opcode, xtl::span<const u
|
|||
#undef BEGINE_DEFINE_TARGET
|
||||
#undef DEFINE_NEUTRAL_RUNTIME_OP
|
||||
#undef DEFINE_RUNTIME_OP
|
||||
#undef DEFINE_RUNTIME_FENCE_OP
|
||||
#undef END_DEFINE_TARGET
|
||||
default:
|
||||
return kcr_error;
|
||||
|
|
Loading…
Reference in New Issue