update kpu driver

fix/double
jiangxiangbing 2019-03-25 11:21:06 +08:00
parent b6bcf59355
commit f296616565
3 changed files with 355 additions and 47 deletions

View File

@@ -28,12 +28,13 @@
using namespace sys;
#define KPU_DEBUG 1
#define KPU_DEBUG 0
#define NNCASE_DEBUG 0
#define USE_CACHED_AI_RAM 0
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
#define ALIGN_UP(x, align) ((x + (align - 1)) & (~(align - 1)))
#define COMMON_ENTRY \
semaphore_lock locker(free_mutex_);
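The new ALIGN_UP macro rounds x up to the next multiple of a power-of-two align; the 8-way unrolled loops added below use it to size their strides. A minimal standalone check (not part of the commit):

#include <assert.h>

#define ALIGN_UP(x, align) ((x + (align - 1)) & (~(align - 1)))

int main(void)
{
    assert(ALIGN_UP(13, 8) == 16); /* rounds up to the next multiple of 8 */
    assert(ALIGN_UP(16, 8) == 16); /* already-aligned values are unchanged */
    assert(ALIGN_UP(1, 8) == 8);
    return 0;
}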
@@ -176,6 +177,7 @@ public:
}
}
done_flag_ = 0;
dma_close(dma_ch_);
return 0;
}
@@ -273,22 +275,51 @@ private:
if ((uintptr_t)src % 8 == 0 && width % 8 == 0)
{
#define UPLOAD_BEGIN() \
for (oc = 0; oc < channels; oc++) \
{ \
uint8_t* channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
for (y = 0; y < height; y++) \
{ \
uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);
#define UPLOAD_END() \
} \
}
width /= 8;
const uint64_t *u64_src = (const uint64_t *)src;
for (oc = 0; oc < channels; oc++)
{
    uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
    for (y = 0; y < height; y++)
    {
        uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64L);
        for (x = 0; x < width; x++)
            y_origin[x] = *u64_src++;
    }
}
if (width == 1)
{
    UPLOAD_BEGIN()
    y_origin[0] = *u64_src++;
    UPLOAD_END()
}
else if (width == 2)
{
    UPLOAD_BEGIN()
    {
        y_origin[0] = *u64_src++;
        y_origin[1] = *u64_src++;
    }
    UPLOAD_END()
}
else if (width == 4)
{
    UPLOAD_BEGIN()
    {
        y_origin[0] = *u64_src++;
        y_origin[1] = *u64_src++;
        y_origin[2] = *u64_src++;
        y_origin[3] = *u64_src++;
    }
    UPLOAD_END()
}
else
{
    UPLOAD_BEGIN()
    for (x = 0; x < width; x++)
        y_origin[x] = *u64_src++;
#if NNCASE_DEBUG
    assert(y_origin > AI_IO_BASE_ADDR && y_origin < (AI_IO_BASE_ADDR + 2 * 1024 * 1024));
#endif
    UPLOAD_END()
}
}
else
@@ -322,20 +353,6 @@ private:
kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
}
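kpu_upload_core packs a channels x height x width byte image into the KPU's AI RAM, where every image row starts on a 64-byte boundary (row_length 64-byte units per row) and row_group channels share one 64-byte group, offset by row_padding. The rewrite above keeps that addressing but dispatches on the row width in 8-byte units, so the common 1-, 2- and 4-unit cases run without an inner loop. A sketch of the generic path against an ordinary buffer, assuming an 8-byte-aligned source and width divisible by 8 (upload_rows is illustrative, not the driver's function):

#include <stdint.h>

static void upload_rows(uint8_t *dest, const uint8_t *src,
                        int width, int height, int channels,
                        int row_group, int row_length, int row_padding)
{
    const uint64_t *u64_src = (const uint64_t *)src;
    int oc, y, x, w8 = width / 8; /* copy in 8-byte units, as the driver does */

    for (oc = 0; oc < channels; oc++)
    {
        /* row_group channels share a 64-byte group; successive groups are
           row_length * height * 64 bytes apart */
        uint8_t *channel_origin = dest
            + oc / row_group * row_length * height * 64
            + oc % row_group * row_padding;
        for (y = 0; y < height; y++)
        {
            uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);
            for (x = 0; x < w8; x++)
                y_origin[x] = *u64_src++;
        }
    }
}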
void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels)
{
int ic, oc;
for (oc = 0; oc < output_channels; oc++)
{
const float *c_weights = weights + oc * input_channels;
float sum = 0.0f;
for (ic = 0; ic < input_channels; ic++)
sum += src[ic] * c_weights[ic];
dest[oc] = sum + biases[oc];
}
}
void kpu_add(const kpu_model_add_layer_argument_t *arg)
{
const float *src_a = (const float *)(ctx_.main_buffer + arg->main_mem_in_a_address);
@@ -351,7 +368,7 @@ private:
{
const uint8_t *src_a = (const uint8_t *)(ctx_.main_buffer + arg->main_mem_in_a_address);
const uint8_t *src_b = (const uint8_t *)(ctx_.main_buffer + arg->main_mem_in_b_address);
size_t count = arg->count;
size_t count = ALIGN_UP(arg->count, 8) / 8;
int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;
@@ -361,26 +378,132 @@ private:
if (sh_a == sh_b)
{
#define QADD_UNROLL_1(x) \
int64_t a##x = *src_a++; \
int64_t b##x = *src_b++;
#define QADD_UNROLL_2(x) \
a##x += off_a; \
b##x += off_b;
#define QADD_UNROLL_3(x) \
a##x *= mul_a; \
b##x *= mul_b;
#define QADD_UNROLL_4(x) \
int64_t v##x = a##x + b##x;
#define QADD_UNROLL_5(x) \
v##x >>= sh_a;
#define QADD_UNROLL_6(x) \
v##x *= mul_o;
#define QADD_UNROLL_7(x) \
v##x >>= sh_o;
#define QADD_UNROLL_8(x) \
v##x += off_o;
#define QADD_UNROLL_9(x) \
v##x = min(0xFF, max(0, v##x));
#define QADD_UNROLL_10(x) \
*dest++ = v##x;
#define QADD_UNROLL_S(x) \
QADD_UNROLL_##x(0) \
QADD_UNROLL_##x(1) \
QADD_UNROLL_##x(2) \
QADD_UNROLL_##x(3) \
QADD_UNROLL_##x(4) \
QADD_UNROLL_##x(5) \
QADD_UNROLL_##x(6) \
QADD_UNROLL_##x(7)
for (i = 0; i < count; i++)
{
int64_t a = (*src_a++ + off_a) * mul_a;
int64_t b = (*src_b++ + off_b) * mul_b;
int64_t value = (((a + b) >> sh_a) * mul_o >> sh_o) + off_o;
if (value < 0) value = 0;
if (value > 0xFF) value = 0xFF;
*dest++ = (uint8_t)value;
QADD_UNROLL_S(1);
QADD_UNROLL_S(2);
QADD_UNROLL_S(3);
QADD_UNROLL_S(4);
QADD_UNROLL_S(5);
QADD_UNROLL_S(6);
QADD_UNROLL_S(7);
QADD_UNROLL_S(8);
QADD_UNROLL_S(9);
QADD_UNROLL_S(10);
}
}
else
{
#undef QADD_UNROLL_1
#define QADD_UNROLL_1(x) \
int64_t a##x = *src_a++; \
int64_t b##x = *src_b++;
#undef QADD_UNROLL_2
#define QADD_UNROLL_2(x) \
a##x += off_a; \
b##x += off_b;
#undef QADD_UNROLL_3
#define QADD_UNROLL_3(x) \
a##x *= mul_a; \
b##x *= mul_b;
#undef QADD_UNROLL_4
#define QADD_UNROLL_4(x) \
a##x >>= sh_a; \
b##x >>= sh_b;
#undef QADD_UNROLL_5
#define QADD_UNROLL_5(x) \
int64_t v##x = a##x + b##x;
#undef QADD_UNROLL_6
#define QADD_UNROLL_6(x) \
v##x *= mul_o;
#undef QADD_UNROLL_7
#define QADD_UNROLL_7(x) \
v##x >>= sh_o;
#undef QADD_UNROLL_8
#define QADD_UNROLL_8(x) \
v##x += off_o;
#undef QADD_UNROLL_9
#define QADD_UNROLL_9(x) \
v##x = min(0xFF, max(0, v##x));
#undef QADD_UNROLL_10
#define QADD_UNROLL_10(x) \
*dest++ = v##x;
#undef QADD_UNROLL_S
#define QADD_UNROLL_S(x) \
QADD_UNROLL_##x(0) \
QADD_UNROLL_##x(1) \
QADD_UNROLL_##x(2) \
QADD_UNROLL_##x(3) \
QADD_UNROLL_##x(4) \
QADD_UNROLL_##x(5) \
QADD_UNROLL_##x(6) \
QADD_UNROLL_##x(7)
for (i = 0; i < count; i++)
{
int64_t a = (*src_a++ + off_a) * mul_a >> sh_a;
int64_t b = (*src_b++ + off_b) * mul_b >> sh_b;
int64_t value = ((a + b) * mul_o >> sh_o) + off_o;
if (value < 0) value = 0;
if (value > 0xFF) value = 0xFF;
*dest++ = (uint8_t)value;
QADD_UNROLL_S(1);
QADD_UNROLL_S(2);
QADD_UNROLL_S(3);
QADD_UNROLL_S(4);
QADD_UNROLL_S(5);
QADD_UNROLL_S(6);
QADD_UNROLL_S(7);
QADD_UNROLL_S(8);
QADD_UNROLL_S(9);
QADD_UNROLL_S(10);
}
}
}
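Per element, the QADD_UNROLL macros perform exactly the arithmetic of the scalar loops they replace; the unrolling only interleaves eight elements per iteration so the loads, multiplies and stores pipeline better. A compact reference of the per-element math (a restatement of the removed scalar code, not part of the commit):

#include <stdint.h>

/* Requantize both uint8 inputs, add, requantize the sum, clamp to [0, 255].
   When sh_a == sh_b the driver shifts once after the add instead of once
   per input, which is what the first branch above exploits. */
static uint8_t qadd_one(uint8_t a8, uint8_t b8,
                        int64_t off_a, int64_t mul_a, int64_t sh_a,
                        int64_t off_b, int64_t mul_b, int64_t sh_b,
                        int64_t off_o, int64_t mul_o, int64_t sh_o)
{
    int64_t a = ((int64_t)a8 + off_a) * mul_a >> sh_a;
    int64_t b = ((int64_t)b8 + off_b) * mul_b >> sh_b;
    int64_t v = ((a + b) * mul_o >> sh_o) + off_o;

    if (v < 0)
        v = 0;
    if (v > 0xFF)
        v = 0xFF;
    return (uint8_t)v;
}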
@@ -445,6 +568,51 @@ private:
}
}
void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg)
{
const float *src = (const float *)(ctx_.main_buffer + arg->main_mem_in_address);
float *dest = (float *)(ctx_.main_buffer + arg->main_mem_out_address);
kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
uint32_t out_y, out_x, oc;
for (oc = 0; oc < out_shape.channels; oc++)
{
const float *channel_src = src + in_shape.width * in_shape.height * oc;
for (out_y = 0; out_y < out_shape.height; out_y++)
{
for (out_x = 0; out_x < out_shape.width; out_x++)
{
int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
int32_t kernel_x_start = max(0, -in_x_origin);
int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
int32_t kernel_y_start = max(0, -in_y_origin);
int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
float value = 0;
float kernel_count = 0;
int32_t kernel_y, kernel_x;
for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
{
for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
{
int32_t in_x = in_x_origin + kernel_x;
int32_t in_y = in_y_origin + kernel_y;
value += channel_src[in_y * in_shape.width + in_x];
kernel_count++;
}
}
*dest++ = value / kernel_count;
}
}
}
}
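kpu_average_pool2d divides by kernel_count, the number of taps that actually land inside the input, so border outputs are not diluted by zero padding. The output extent follows the usual out = (in + 2 * pad - kernel) / stride + 1. A standalone check of the window clipping with made-up sizes:

#include <stdio.h>

#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))

int main(void)
{
    /* Hypothetical 3-wide kernel, stride 1, padding 1 over a 5-wide row:
       count the in-bounds taps for each output column. */
    int in_w = 5, kernel_w = 3, stride_w = 1, pad_w = 1;
    int out_w = (in_w + 2 * pad_w - kernel_w) / stride_w + 1;
    int out_x;

    for (out_x = 0; out_x < out_w; out_x++)
    {
        int in_x_origin = out_x * stride_w - pad_w;
        int x_start = max(0, -in_x_origin);
        int x_end = min(kernel_w, in_w - in_x_origin);
        printf("out_x=%d taps=%d\n", out_x, x_end - x_start); /* 2 at the borders, 3 inside */
    }
    return 0;
}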
void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg)
{
size_t count = arg->count;
@@ -478,12 +646,21 @@ private:
{
const uint8_t *src = (const uint8_t *)(ctx_.main_buffer + arg->main_mem_in_address);
uint8_t *dest = (uint8_t *)(ctx_.main_buffer + arg->main_mem_out_address);
size_t oc, count = arg->count;
size_t oc, count = ALIGN_UP(arg->count, 8);
const uint8_t *table = arg->table;
for (oc = 0; oc < count; oc++)
dest[oc] = table[*src++];
}
for (oc = 0; oc < count;)
{
dest[oc++] = table[*src++];
dest[oc++] = table[*src++];
dest[oc++] = table[*src++];
dest[oc++] = table[*src++];
dest[oc++] = table[*src++];
dest[oc++] = table[*src++];
dest[oc++] = table[*src++];
dest[oc++] = table[*src++];
}
}
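The lookup itself is one indexed load per byte; the eight-fold unrolling only amortizes the loop overhead. Tables like arg->table bake an entire quantized activation into 256 entries. A hypothetical example of building such a table (a zero-point ReLU, not the commit's table):

#include <stdint.h>

/* Precompute activation(i) for every possible input byte so inference
   reduces to table[*src++]. */
static void build_relu_table(uint8_t table[256], uint8_t zero_point)
{
    int i;
    for (i = 0; i < 256; i++)
        table[i] = i < zero_point ? zero_point : (uint8_t)i;
}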
void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg)
{
@@ -538,6 +715,64 @@ private:
}
}
void kpu_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg)
{
const float *src = (const float *)(ctx_.main_buffer + arg->main_mem_in_address);
float *dest = (float *)(ctx_.main_buffer + arg->main_mem_out_address);
uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
const float *weights = arg->weights, *bias = arg->weights + in_channels * out_channels;
for (oc = 0; oc < out_channels; oc++)
{
const float *c_weights = weights + oc * in_channels;
float sum = 0.0f;
for (ic = 0; ic < in_channels; ic++)
sum += src[ic] * c_weights[ic];
dest[oc] = sum + bias[oc];
}
}
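kpu_fully_connected reads the weights row-major as [out_channels][in_channels] and expects the biases packed immediately after them (bias = weights + in_channels * out_channels), matching the flexible weights[0] member of the header struct below. Index helpers spelling out that layout (illustrative, not from the commit):

#include <stddef.h>

/* weights buffer: out_channels rows of in_channels floats, then out_channels biases */
static size_t fc_weight_index(size_t oc, size_t ic, size_t in_channels)
{
    return oc * in_channels + ic;
}

static size_t fc_bias_index(size_t oc, size_t in_channels, size_t out_channels)
{
    return in_channels * out_channels + oc;
}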
void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg)
{
const float *src = (const float *)(ctx_.main_buffer + arg->main_mem_in_address);
float *dest = (float *)(ctx_.main_buffer + arg->main_mem_out_address);
kpu_model_shape_t in_shape = arg->shape;
uint32_t oc, oy, ox;
for (oy = 0; oy < in_shape.height; oy++)
for (ox = 0; ox < in_shape.width; ox++)
for (oc = 0; oc < in_shape.channels; oc++)
*dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
}
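kpu_tf_flatten reorders the KPU's channel-major (CHW) buffer into TensorFlow's row-major (HWC) order while flattening, so element (oc, oy, ox) moves from source index (oc * H + oy) * W + ox to destination index (oy * W + ox) * C + oc. The mapping as two helpers (illustrative):

#include <stddef.h>

static size_t chw_index(size_t oc, size_t oy, size_t ox, size_t h, size_t w)
{
    return (oc * h + oy) * w + ox; /* source: channel-major */
}

static size_t hwc_index(size_t oc, size_t oy, size_t ox, size_t w, size_t c)
{
    return (oy * w + ox) * c + oc; /* destination: row-major, channels last */
}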
void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg)
{
const float *src = (const float *)(ctx_.main_buffer + arg->main_mem_in_address);
float *dest = (float *)(ctx_.main_buffer + arg->main_mem_out_address);
kpu_model_shape_t in_shape = arg->in_shape;
uint32_t out_width = arg->out_width, out_height = arg->out_height;
uint32_t oc, oy, ox;
float height_scale = (float)in_shape.height / out_height;
float width_scale = (float)in_shape.width / out_width;
for (oc = 0; oc < in_shape.channels; oc++)
{
const float *channel_src = src + in_shape.width * in_shape.height * oc;
for (oy = 0; oy < out_height; oy++)
{
uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
const float *y_origin = channel_src + in_y * in_shape.width;
for (ox = 0; ox < out_width; ox++)
{
uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
*dest++ = y_origin[in_x];
}
}
}
}
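Each output pixel snaps to floor(o * in / out), clamped to the last source index; note the header struct below carries an align_corners field that this kernel does not consult. A standalone print of the mapping for a hypothetical 4-to-7 upscale:

#include <math.h>
#include <stdio.h>

#define min(a, b) (((a) < (b)) ? (a) : (b))

int main(void)
{
    unsigned in_w = 4, out_w = 7, ox;
    float scale = (float)in_w / out_w;

    for (ox = 0; ox < out_w; ox++)
    {
        unsigned in_x = (unsigned)min(floorf(ox * scale), in_w - 1);
        printf("out %u <- in %u\n", ox, in_x); /* 0 0 1 1 2 2 3 */
    }
    return 0;
}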
void kpu_conv(const kpu_model_conv_layer_argument_t *arg)
{
volatile kpu_layer_argument_t layer = *(kpu_layer_argument_t *)(ctx_.model_buffer + arg->layer_offset);
@@ -550,7 +785,7 @@ private:
uint8_t *dest = ctx_.main_buffer + arg->main_mem_out_address;
layer.dma_parameter.data.send_data_out = 1;
kpu_send_layer((const kpu_layer_argument_t *)&layer);
dma_set_request_source(dma_ch_, dma_req_);
dma_transmit_async(dma_ch_, (void *)(&kpu_.fifo_data_out), dest, 0, 1, sizeof(uint64_t), (layer.dma_parameter.data.dma_total_byte + 8) / 8, 8, completion_event_);
}
@@ -568,8 +803,9 @@ private:
kpu_.interrupt_mask.data.layer_cfg_almost_full_int = 1;
kpu_.interrupt_mask.data.reserved = 0;
#endif
kpu_send_layer((const kpu_layer_argument_t *)&layer);
}
kpu_send_layer((const kpu_layer_argument_t *)&layer);
}
void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg)
@@ -636,6 +872,8 @@ private:
return "GAP";
case KL_QUANTIZED_MAX_POOL2D:
return "QuantMaxPool2d";
case KL_AVERAGE_POOL2D:
return "AveragePool2d";
case KL_QUANTIZE:
return "Quantize";
case KL_DEQUANTIZE:
@@ -650,6 +888,12 @@ private:
return "Concat";
case KL_QUANTIZED_CONCAT:
return "QuantConcat";
case KL_FULLY_CONNECTED:
return "FullyConnected";
case KL_TENSORFLOW_FLATTEN:
return "TFFlatten";
case KL_RESIZE_NEAREST_NEIGHBOR:
return "ResizeNearestNeighbor";
case KL_K210_CONV:
return "K210Conv";
case KL_K210_ADD_PADDING:
@@ -726,6 +970,9 @@ private:
case KL_QUANTIZED_MAX_POOL2D:
kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body);
break;
case KL_AVERAGE_POOL2D:
kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body);
break;
case KL_QUANTIZE:
kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body);
break;
@@ -745,6 +992,15 @@ private:
case KL_QUANTIZED_CONCAT:
kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body);
break;
case KL_FULLY_CONNECTED:
kpu_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body);
break;
case KL_TENSORFLOW_FLATTEN:
kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body);
break;
case KL_RESIZE_NEAREST_NEIGHBOR:
kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body);
break;
case KL_K210_CONV:
kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body);
return 0;

View File

@@ -24,7 +24,7 @@
#include <sysctl.h>
#include <uarths.h>
#define PLL1_OUTPUT_FREQ 160000000UL
#define PLL1_OUTPUT_FREQ 400000000UL
#define PLL2_OUTPUT_FREQ 45158400UL
extern uint8_t _tls_data[];

View File

@@ -359,6 +359,12 @@ typedef enum
KL_SOFTMAX,
KL_CONCAT,
KL_QUANTIZED_CONCAT,
KL_FULLY_CONNECTED,
KL_QUANTIZED_FULLY_CONNECTED,
KL_TENSORFLOW_FLATTEN,
KL_QUANTIZED_TENSORFLOW_FLATTEN,
KL_RESIZE_NEAREST_NEIGHBOR,
KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR,
KL_K210_CONV = 10240,
KL_K210_ADD_PADDING,
KL_K210_REMOVE_PADDING,
@@ -470,6 +476,22 @@ typedef struct
uint32_t padding_height;
} kpu_model_quant_max_pool2d_layer_argument_t;
typedef struct
{
uint32_t flags;
uint32_t main_mem_in_address;
uint32_t main_mem_out_address;
kpu_model_shape_t in_shape;
kpu_model_shape_t out_shape;
uint32_t kernel_width;
uint32_t kernel_height;
uint32_t stride_width;
uint32_t stride_height;
uint32_t padding_width;
uint32_t padding_height;
kpu_model_activation_t act;
} kpu_model_ave_pool2d_layer_argument_t;
typedef struct
{
uint32_t flags;
@@ -547,6 +569,36 @@ typedef struct
kpu_model_memory_range_t inputs_mem[0];
} kpu_model_concat_layer_argument_t;
typedef struct
{
uint32_t flags;
uint32_t main_mem_in_address;
uint32_t main_mem_out_address;
uint32_t in_channels;
uint32_t out_channels;
kpu_model_activation_t act;
float weights[0];
} kpu_model_fully_connected_layer_argument_t;
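weights[0] is a zero-length (GNU-style) flexible array: in a kmodel the in_channels * out_channels weights plus out_channels biases sit inline right after the struct's fixed fields. A sketch of sizing such a record when assembling a layer by hand, assuming the struct definition above (make_fc_layer is hypothetical, not an SDK function):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

kpu_model_fully_connected_layer_argument_t *make_fc_layer(uint32_t in_channels, uint32_t out_channels)
{
    /* weights followed by biases, all inline after the fixed fields */
    size_t n_floats = (size_t)in_channels * out_channels + out_channels;
    kpu_model_fully_connected_layer_argument_t *arg = malloc(sizeof(*arg) + n_floats * sizeof(float));

    if (!arg)
        return NULL;
    memset(arg, 0, sizeof(*arg));
    arg->in_channels = in_channels;
    arg->out_channels = out_channels;
    return arg;
}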
typedef struct
{
uint32_t flags;
uint32_t main_mem_in_address;
uint32_t main_mem_out_address;
kpu_model_shape_t shape;
} kpu_model_tf_flatten_layer_argument_t;
typedef struct
{
uint32_t flags;
uint32_t main_mem_in_address;
uint32_t main_mem_out_address;
kpu_model_shape_t in_shape;
uint32_t out_width;
uint32_t out_height;
uint32_t align_corners;
} kpu_model_resize_nearest_neighbor_layer_argument_t;
typedef struct
{
const uint8_t *model_buffer;