Update nncase: fence KPU data

2019-10-25 12:00:18 +08:00 · 2019-10-25 12:00:18 +08:00 · e792535609
parent 9a6cbb6f41
commit e792535609
9 changed files with 43 additions and 41 deletions
--- a/lib/drivers/include/kpu.h
+++ b/lib/drivers/include/kpu.h
@ -681,7 +681,6 @@ typedef struct
            dmac_channel_number_t dma_ch;
            kpu_done_callback_t done_callback;
            volatile uint8_t load_first;
-            volatile uint8_t is_memory_cache;
            void *userdata;
        };

--- a/lib/drivers/kpu.c
+++ b/lib/drivers/kpu.c
@ -1006,12 +1006,8 @@ static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_mod
    size_t count = arg->count;
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);

-    kpu_model_quant_param_t q;
-#if FIX_CACHE
-    memcpy(&q, &arg->quant_param, sizeof(kpu_model_quant_param_t));
-#else
-    q = arg->quant_param;
-#endif
+    kpu_model_quant_param_t q = arg->quant_param;
+
    float scale = 1.f / q.scale;

    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
@ -1032,12 +1028,8 @@ static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *a
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t oc, count = arg->count;
-    kpu_model_quant_param_t q;
-#if FIX_CACHE
-    memcpy(&q, &arg->quant_param, sizeof(kpu_model_quant_param_t));
-#else
-    q = arg->quant_param;
-#endif
+    kpu_model_quant_param_t q = arg->quant_param;
+
    for(oc = 0; oc < count; oc++)
        dest[oc] = *src++ * q.scale + q.bias;
 }
@ -1273,14 +1265,10 @@ static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_mod
 static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
 {
    volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
-    uintptr_t fix = 0;
-    if(ctx->is_memory_cache)
-    {
-        fix = 0x40000000;
-    }
-    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - fix;
-    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - fix;
-    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - fix;
+
+    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - 0x40000000;
+    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - 0x40000000;
+    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - 0x40000000;

    if(arg->flags & KLF_MAIN_MEM_OUT)
    {
@ -1375,14 +1363,11 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
    uintptr_t base_addr = (uintptr_t)buffer;
    const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;

-    if(is_memory_cache((uintptr_t)buffer))
-    {
-        ctx->load_first = 1;
-        ctx->is_memory_cache = 1;
-    }
+    configASSERT(is_memory_cache((uintptr_t)buffer))

    if(header->version == 3 && header->arch == 0)
    {
+        ctx->load_first = 1;
        ctx->is_nncase = 0;
        ctx->model_buffer = buffer;
        ctx->output_count = header->output_count;
--- a/lib/nncase/include/kernels/neutral/neutral_kernels.h
+++ b/lib/nncase/include/kernels/neutral/neutral_kernels.h
@ -141,17 +141,11 @@ namespace kernels

        inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
        {
-#if FIX_CACHE
-            float *cache_mem = new float[b_cols];
-            memcpy(cache_mem, bias, b_cols*sizeof(float));
-#else
-            const float *cache_mem =bias;
-#endif
            for (size_t oy = 0; oy < a_rows; oy++)
            {
                for (size_t ox = 0; ox < b_cols; ox++)
                {
-                    float value = cache_mem[ox];
+                    float value = bias[ox];

                    for (size_t i = 0; i < a_cols; i++)
                    {
@ -163,9 +157,6 @@ namespace kernels
                    output[oy * b_cols + ox] = details::apply_activation(value, fused_activation);
                }
            }
-#if FIX_CACHE
-            delete []cache_mem;
-#endif
        }

        template <class T>
--- a/lib/nncase/include/runtime/interpreter.h
+++ b/lib/nncase/include/runtime/interpreter.h
@ -55,6 +55,9 @@ namespace runtime

        void run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata);

+    public:
+        uint8_t load_first_;
+
    protected:
        virtual bool initialize();
        virtual xtl::span<uint8_t> memory_at(const memory_range &range) const noexcept;
--- a/lib/nncase/include/runtime/k210/k210_ops_body.h
+++ b/lib/nncase/include/runtime/k210/k210_ops_body.h
@ -61,9 +61,9 @@ namespace runtime
                reader.skip(layer.kernel_load_cfg.data.para_start_addr);
                reader.read_span(weights, weights_size);
 #if !NNCASE_TARGET_K210_SIMULATOR
-                layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data();
-                layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation;
-                layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data();
+                layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data() - 0x40000000;
+                layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation - 0x40000000;
+                layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data() - 0x40000000;
 #endif
            }

--- a/lib/nncase/include/runtime/runtime_op.def
+++ b/lib/nncase/include/runtime/runtime_op.def
@ -28,5 +28,5 @@ END_DEFINE_TARGET()
 // K210
 BEGINE_DEFINE_TARGET(k210)
     DEFINE_RUNTIME_OP(k210, kpu_upload,    KPUUpload,  0x2001)
-     DEFINE_RUNTIME_OP(k210, kpu_conv2d,    KPUConv2D,  0x2002)
+     DEFINE_RUNTIME_FENCE_OP(k210, kpu_conv2d,    KPUConv2D,  0x2002)
 END_DEFINE_TARGET()
--- a/lib/nncase/include/runtime/runtime_op.h
+++ b/lib/nncase/include/runtime/runtime_op.h
@ -23,6 +23,7 @@ namespace runtime
 #define BEGINE_DEFINE_TARGET(...)
 #define DEFINE_NEUTRAL_RUNTIME_OP(id, name, value) rop_##id = value,
 #define DEFINE_RUNTIME_OP(target, id, name, value) rop_##target##_##id = value,
+#define DEFINE_RUNTIME_FENCE_OP(target, id, name, value) rop_##target##_##id = value,
 #define END_DEFINE_TARGET()

    enum runtime_opcode : uint32_t
@ -32,12 +33,16 @@ namespace runtime

 #undef DEFINE_NEUTRAL_RUNTIME_OP
 #undef DEFINE_RUNTIME_OP
+#undef DEFINE_RUNTIME_FENCE_OP
 #define DEFINE_NEUTRAL_RUNTIME_OP(id, name, value) \
    case rop_##id:                                 \
        return #name;
 #define DEFINE_RUNTIME_OP(target, id, name, value) \
    case rop_##target##_##id:                      \
        return #name;
+#define DEFINE_RUNTIME_FENCE_OP(target, id, name, value) \
+    case rop_##target##_##id:                      \
+        return #name;

    constexpr std::string_view node_opcode_names(runtime_opcode opcode)
    {
@ -52,6 +57,7 @@ namespace runtime
 #undef BEGINE_DEFINE_TARGET
 #undef DEFINE_NEUTRAL_RUNTIME_OP
 #undef DEFINE_RUNTIME_OP
+#undef DEFINE_RUNTIME_FENCE_OP
 #undef END_DEFINE_TARGET
 }
 }
--- a/lib/nncase/runtime/interpreter.cpp
+++ b/lib/nncase/runtime/interpreter.cpp
@ -46,7 +46,9 @@ bool interpreter_base::try_load_model(const uint8_t *buffer)
    node_headers_ = { reinterpret_cast<const node_header *>(offset), nodes_size() };
    offset += sizeof(node_header) * nodes_size();
    node_body_start_ = offset;
-
+#if !NNCASE_TARGET_K210_SIMULATOR
+    load_first_ = 1;
+#endif
    return initialize();
 }

@ -91,6 +93,7 @@ void interpreter_base::step()

        if (cnt_node_ == nodes_size())
        {
+            load_first_ = 0;
            run_callback_(userdata_);
            break;
        }
--- a/lib/nncase/runtime/kernel_registry.cpp
+++ b/lib/nncase/runtime/kernel_registry.cpp
@ -17,6 +17,7 @@
 #include <runtime/kernel_registry.h>
 #include <runtime/neutral/neutral_ops_body.h>
 #include <runtime/span_reader.h>
+#include <cstring>

 using namespace nncase;
 using namespace nncase::runtime;
@ -33,6 +34,8 @@ namespace runtime
    kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t);
 #define DEFINE_RUNTIME_OP(target, id, name, value) \
    kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t);
+#define DEFINE_RUNTIME_FENCE_OP(target, id, name, value) \
+    kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t);

 #define END_DEFINE_TARGET() }

@ -41,6 +44,7 @@ namespace runtime
 #undef BEGINE_DEFINE_TARGET
 #undef DEFINE_NEUTRAL_RUNTIME_OP
 #undef DEFINE_RUNTIME_OP
+#undef DEFINE_RUNTIME_FENCE_OP
 #undef END_DEFINE_TARGET
 }
 }
@ -66,6 +70,16 @@ kernel_call_result runtime::call_kernel(runtime_opcode opcode, xtl::span<const u
        options.deserialize(reader);                                    \
        return nncase::runtime::target::id(options, interpreter, step); \
    }
+#define DEFINE_RUNTIME_FENCE_OP(target, id, name, value)                      \
+    case rop_##target##_##id:                                           \
+    {                                                                   \
+        if(interpreter.load_first_)                                     \
+            memcpy((void *)((uintptr_t)(body.data())-0x40000000), body.data(), body.size());\
+        nncase::runtime::target::id##_options options;                  \
+        options.deserialize(reader);                                    \
+        return nncase::runtime::target::id(options, interpreter, step); \
+    }
+
 #define END_DEFINE_TARGET()

 #include <runtime/runtime_op.def>
@ -73,6 +87,7 @@ kernel_call_result runtime::call_kernel(runtime_opcode opcode, xtl::span<const u
 #undef BEGINE_DEFINE_TARGET
 #undef DEFINE_NEUTRAL_RUNTIME_OP
 #undef DEFINE_RUNTIME_OP
+#undef DEFINE_RUNTIME_FENCE_OP
 #undef END_DEFINE_TARGET
    default:
        return kcr_error;