Flush kmodel data from CACHE to IOMEM.
parent
4e53f2abf4
commit
d60910fc00
|
@ -20,6 +20,7 @@
|
|||
#include "dmac.h"
|
||||
|
||||
#define kpu_matmul_begin kpu_conv2d_output
|
||||
#define IOMEM 0x40000000
|
||||
|
||||
typedef int (*plic_irq_callback_t)(void *ctx);
|
||||
|
||||
|
|
|
@ -1006,12 +1006,8 @@ static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_mod
|
|||
size_t count = arg->count;
|
||||
const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
||||
|
||||
kpu_model_quant_param_t q;
|
||||
#if FIX_CACHE
|
||||
memcpy(&q, &arg->quant_param, sizeof(kpu_model_quant_param_t));
|
||||
#else
|
||||
q = arg->quant_param;
|
||||
#endif
|
||||
kpu_model_quant_param_t q = arg->quant_param;
|
||||
|
||||
float scale = 1.f / q.scale;
|
||||
|
||||
uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
|
||||
|
@ -1032,12 +1028,8 @@ static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *a
|
|||
const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
||||
float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
||||
size_t oc, count = arg->count;
|
||||
kpu_model_quant_param_t q;
|
||||
#if FIX_CACHE
|
||||
memcpy(&q, &arg->quant_param, sizeof(kpu_model_quant_param_t));
|
||||
#else
|
||||
q = arg->quant_param;
|
||||
#endif
|
||||
kpu_model_quant_param_t q = arg->quant_param;
|
||||
|
||||
for(oc = 0; oc < count; oc++)
|
||||
dest[oc] = *src++ * q.scale + q.bias;
|
||||
}
|
||||
|
@ -1277,9 +1269,9 @@ static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_mod
|
|||
static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
|
||||
{
|
||||
volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
|
||||
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset);
|
||||
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset);
|
||||
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset);
|
||||
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
|
||||
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
|
||||
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;
|
||||
|
||||
if(arg->flags & KLF_MAIN_MEM_OUT)
|
||||
{
|
||||
|
@ -1372,7 +1364,7 @@ static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_c
|
|||
int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
||||
{
|
||||
#if FIX_CACHE
|
||||
configASSERT(!is_memory_cache((uintptr_t)buffer));
|
||||
configASSERT(is_memory_cache((uintptr_t)buffer));
|
||||
#endif
|
||||
uintptr_t base_addr = (uintptr_t)buffer;
|
||||
const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
|
||||
|
@ -1389,6 +1381,20 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
|||
ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
|
||||
if(!ctx->main_buffer)
|
||||
return -1;
|
||||
uint32_t body_size = 0;
|
||||
for(int i=0; i<ctx->layers_length; i++)
|
||||
{
|
||||
const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
|
||||
body_size += cnt_layer_header->body_size;
|
||||
}
|
||||
uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
|
||||
const uint8_t *body_start_cache = ctx->body_start;
|
||||
memcpy(body_start_iomem, body_start_cache, body_size);
|
||||
for(int i=0; i<body_size; i++)
|
||||
{
|
||||
configASSERT(body_start_iomem[i] == body_start_cache[i]);
|
||||
}
|
||||
|
||||
} else if(header->version == 'KMDL')
|
||||
{
|
||||
return nncase_load_kmodel(ctx, buffer);
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
#include <cmath>
|
||||
#include <runtime/runtime_op_utility.h>
|
||||
#include <xtl/xspan.hpp>
|
||||
#include <cstring>
|
||||
#include <utils.h>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
|
@ -42,12 +40,7 @@ namespace kernels
|
|||
const auto in_a_off = kernels::details::get_reduced_offset(in_off, in_a_shape);
|
||||
const auto in_b_off = kernels::details::get_reduced_offset(in_off, in_b_shape);
|
||||
const auto a = input_a[offset(in_a_shape, in_a_off)];
|
||||
#if FIX_CACHE
|
||||
float b;
|
||||
memcpy(&b, &input_b[offset(in_b_shape, in_b_off)], sizeof(float));
|
||||
#else
|
||||
const auto b = input_b[offset(in_b_shape, in_b_off)];
|
||||
#endif
|
||||
output[offset(out_shape, in_off)] = kernels::details::apply_activation(op(a, b), fused_activation);
|
||||
}
|
||||
}
|
||||
|
@ -79,17 +72,6 @@ namespace kernels
|
|||
const auto g_ic = in_shape[1] / groups;
|
||||
const auto g_oc = out_channels / groups;
|
||||
|
||||
#if FIX_CACHE
|
||||
float *cache_weights = new float[(size_t)out_channels * in_shape[1] / groups * filter_h * filter_w];
|
||||
float *cache_bias = new float[out_channels];
|
||||
|
||||
memcpy(cache_weights, weights, (size_t)out_channels * in_shape[1] / groups * filter_h * filter_w*sizeof(float));
|
||||
memcpy(cache_bias, bias, out_channels*sizeof(float));
|
||||
#else
|
||||
const float *cache_weights = weights;
|
||||
const float *cache_bias = bias;
|
||||
#endif
|
||||
|
||||
for (int32_t batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
@ -97,7 +79,7 @@ namespace kernels
|
|||
for (int32_t og = 0; og < groups; og++)
|
||||
{
|
||||
const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
|
||||
const float *w_group_p = cache_weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
|
||||
const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
|
||||
|
||||
for (int32_t oc = 0; oc < g_oc; oc++)
|
||||
{
|
||||
|
@ -113,7 +95,7 @@ namespace kernels
|
|||
const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
|
||||
const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
|
||||
const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
|
||||
float value = cache_bias[og * g_oc + oc];
|
||||
float value = bias[og * g_oc + oc];
|
||||
|
||||
for (int32_t ic = 0; ic < g_ic; ic++)
|
||||
{
|
||||
|
@ -141,10 +123,6 @@ namespace kernels
|
|||
}
|
||||
}
|
||||
}
|
||||
#if FIX_CACHE
|
||||
delete []cache_weights;
|
||||
delete []cache_bias;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class TQ>
|
||||
|
@ -160,17 +138,11 @@ namespace kernels
|
|||
|
||||
inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
|
||||
{
|
||||
#if FIX_CACHE
|
||||
float *cache_mem = new float[b_cols];
|
||||
memcpy(cache_mem, bias, b_cols*sizeof(float));
|
||||
#else
|
||||
const float *cache_mem =bias;
|
||||
#endif
|
||||
for (size_t oy = 0; oy < a_rows; oy++)
|
||||
{
|
||||
for (size_t ox = 0; ox < b_cols; ox++)
|
||||
{
|
||||
float value = cache_mem[ox];
|
||||
float value = bias[ox];
|
||||
|
||||
for (size_t i = 0; i < a_cols; i++)
|
||||
{
|
||||
|
@ -182,9 +154,6 @@ namespace kernels
|
|||
output[oy * b_cols + ox] = details::apply_activation(value, fused_activation);
|
||||
}
|
||||
}
|
||||
#if FIX_CACHE
|
||||
delete []cache_mem;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class T>
|
||||
|
|
|
@ -35,6 +35,7 @@ namespace runtime
|
|||
|
||||
public:
|
||||
bool try_load_model(const uint8_t *buffer);
|
||||
uint32_t model_size(const uint8_t *buffer);
|
||||
|
||||
size_t inputs_size() const noexcept { return model_header_->inputs; }
|
||||
size_t outputs_size() const noexcept { return model_header_->outputs; }
|
||||
|
|
|
@ -61,9 +61,9 @@ namespace runtime
|
|||
reader.skip(layer.kernel_load_cfg.data.para_start_addr);
|
||||
reader.read_span(weights, weights_size);
|
||||
#if !NNCASE_TARGET_K210_SIMULATOR
|
||||
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data();
|
||||
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation;
|
||||
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data();
|
||||
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data() - IOMEM;
|
||||
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation - IOMEM;
|
||||
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data() - IOMEM;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <kernels/k210/k210_kernels.h>
|
||||
#include <runtime/target_interpreter.h>
|
||||
#include <stdio.h>
|
||||
#include <cstring>
|
||||
|
||||
using namespace nncase;
|
||||
using namespace nncase::runtime;
|
||||
|
@ -38,7 +39,22 @@ class nncase_context
|
|||
public:
|
||||
int load_kmodel(const uint8_t *buffer)
|
||||
{
|
||||
return interpreter_.try_load_model(buffer) ? 0 : -1;
|
||||
int ret = interpreter_.try_load_model(buffer) ? 0 : -1;
|
||||
|
||||
uint32_t size = interpreter_.model_size(buffer);
|
||||
uint8_t *buffer_iomem = (uint8_t *)((uintptr_t)buffer - IOMEM);
|
||||
const uint8_t *buffer_cache = buffer;
|
||||
memcpy(buffer_iomem, buffer_cache, size);
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
if(buffer_iomem[i] != buffer_cache[i])
|
||||
{
|
||||
printf("flush model fail:%d %x %x \n", i, buffer_iomem[i], buffer_cache[i]);
|
||||
while(1)
|
||||
;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int get_output(uint32_t index, uint8_t **data, size_t *size)
|
||||
|
|
|
@ -50,6 +50,17 @@ bool interpreter_base::try_load_model(const uint8_t *buffer)
|
|||
return initialize();
|
||||
}
|
||||
|
||||
uint32_t interpreter_base::model_size(const uint8_t *buffer)
|
||||
{
|
||||
uint32_t size = (uint32_t)(node_body_start_ - buffer);
|
||||
for(int i=0; i<nodes_size(); i++)
|
||||
{
|
||||
struct node_header cnt_layer_header = node_headers_[i];;
|
||||
size += cnt_layer_header.body_size;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
bool interpreter_base::initialize()
|
||||
{
|
||||
return true;
|
||||
|
|
Loading…
Reference in New Issue