Add kpu functions

feature/kpu
Guo Hui 2018-11-23 20:56:02 +08:00
parent 8f7723a8b9
commit 655c77623d
3 changed files with 160 additions and 23 deletions

View File

@ -15,11 +15,12 @@ add_compile_flags(LD
add_compile_flags(BOTH
-mcmodel=medany
-fno-common
-march=rv64imafdc
-ffunction-sections
-fdata-sections
-fstrict-volatile-bitfields
-fno-zero-initialized-in-bss
-Os
-O3
-ggdb
)

View File

@ -365,4 +365,16 @@ uint8_t *kpu_get_output_buf(kpu_task_t* task);
*/
void kpu_release_output_buf(uint8_t *output_buf);
/**
 * @brief       Linear quantization parameters for an 8-bit tensor.
 *
 *              kpu_add() converts a stored byte q to a real value as
 *              q * scale + bias, and converts back with (real - bias) / scale.
 */
typedef struct _quantize_param
{
/* Multiplier applied to the stored 8-bit value. */
float scale;
/* Offset added after scaling. */
float bias;
} quantize_param_t;
/**
 * @brief       Set up the KPU registers and hook the AI interrupt.
 *
 * @param[in]   eight_bit_mode  Value written to the KPU eight_bit_mode register
 * @param[in]   callback        Handler registered for IRQN_AI_INTERRUPT
 * @param[in]   userdata        Context pointer passed through to the handler registration
 */
void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata);
/**
 * @brief       Program a DMA channel to copy a layer's input data to AI_IO_BASE_ADDR.
 *
 * @param[in]   layer       Layer whose channel_switch_addr and i_ch_num fields give the input size
 * @param[in]   src         Source buffer
 * @param[in]   dma_ch      DMA channel to use
 * @param[in]   callback    Handler registered on the DMA channel
 * @param[in]   userdata    Context pointer passed through to the handler registration
 */
void kpu_input_dma(kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata);
/**
 * @brief       Clear the KPU interrupt flags and push one layer to the layer argument FIFO.
 *
 * @param[in]   layer       Layer to run; its pool_type field is overwritten with 5 when stride is 2
 * @param[in]   stride      Stride; only the value 2 changes behaviour
 */
void kpu_conv2d(kpu_layer_argument_t *layer, int stride);
/**
 * @brief       Same as kpu_conv2d(), but first programs a DMA channel to copy
 *              the layer output from the KPU output FIFO into dest.
 *
 * @param[in]   layer       Layer to run; its send_data_out field is set to 1
 * @param[in]   stride      Forwarded to kpu_conv2d()
 * @param[in]   dma_ch      DMA channel to use for the output transfer
 * @param[out]  dest        Destination buffer; receives (dma_total_byte + 8) / 8 64-bit words
 * @param[in]   callback    Handler registered on the DMA channel
 * @param[in]   userdata    Context pointer passed through to the handler registration
 */
void kpu_conv2d_output(kpu_layer_argument_t *layer, int stride, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata);
/**
 * @brief       Add two quantized tensors element by element on the CPU and
 *              write the re-quantized result to dest.
 *
 * @param[in]   src1        First input, read linearly (channel, then row, then column)
 * @param[in]   src1_param  Quantization parameters of src1
 * @param[in]   src2        Second input, read in the same order as src1
 * @param[in]   src2_param  Quantization parameters of src2
 * @param[in]   width       Row width in elements
 * @param[in]   height      Number of rows per channel
 * @param[in]   channels    Number of channels
 * @param[out]  dest        Output buffer, written in 64-byte row units
 * @param[in]   dest_param  Quantization parameters of the output
 */
void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, int width, int height, int channels, uint8_t *dest, const quantize_param_t *dest_param);
#endif

View File

@ -3,6 +3,7 @@
#include <sysctl.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "printf.h"
#include "dmac.h"
#include <string.h>
@ -32,18 +33,18 @@ int kpu_continue(void* _task)
int layer_burst_size = 1;
kpu->interrupt_clear.data = (kpu_config_interrupt_t)
{
.calc_done_int=1,
.layer_cfg_almost_empty_int=1,
.layer_cfg_almost_full_int=1
.calc_done_int = 1,
.layer_cfg_almost_empty_int = 1,
.layer_cfg_almost_full_int = 1
};
if(task->remain_layers_length == 0)
if (task->remain_layers_length == 0)
{
return 0;
}
if(task->remain_layers_length <= layer_burst_size)
if (task->remain_layers_length <= layer_burst_size)
{
for(uint32_t i=0; i<task->remain_layers_length; i++)
for (uint32_t i = 0; i < task->remain_layers_length; i++)
{
kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
@ -62,7 +63,7 @@ int kpu_continue(void* _task)
}
else
{
for(uint32_t i=0; i<layer_burst_size; i++)
for (uint32_t i = 0; i < layer_burst_size; i++)
{
kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
@ -88,7 +89,7 @@ static int kpu_run_dma_output(uint32_t dma_ch, void* dst, uint32_t length, plic_
sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
dmac_set_irq(dma_ch, kpu_run_all_done, _task, 1);
dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length+7)/8);
DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);
return 0;
}
@ -99,22 +100,22 @@ static int kpu_run_dma_input_done_push_layers(void* _task)
dmac->channel[task->dma_ch].intclear = 0xFFFFFFFF;
kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t)
{
.fifo_full_threshold = 10, .fifo_empty_threshold=1
.fifo_full_threshold = 10, .fifo_empty_threshold = 1
};
kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t)
{
.eight_bit_mode=task->eight_bit_mode
.eight_bit_mode = task->eight_bit_mode
};
kpu_layer_argument_t* last_layer = &task->layers[task->layers_length-1];
kpu_layer_argument_t* last_layer = &task->layers[task->layers_length - 1];
kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte+1, kpu_run_all_done, task);
kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);
kpu->interrupt_mask.data = (kpu_config_interrupt_t)
{
.calc_done_int=0,
.layer_cfg_almost_empty_int=0,
.layer_cfg_almost_full_int=1
.calc_done_int = 0,
.layer_cfg_almost_empty_int = 0,
.layer_cfg_almost_full_int = 1
};
kpu_continue(task);
return 0;
@ -124,7 +125,7 @@ static void kpu_run_dma_input(uint32_t dma_ch, const void* src, plic_irq_callbac
{
kpu_task_t* task = _task;
kpu_layer_argument_t* first_layer = &task->layers[0];
uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num+1);
uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);
dmac_set_irq(dma_ch, cb, _task, 1);
dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
@ -132,15 +133,15 @@ static void kpu_run_dma_input(uint32_t dma_ch, const void* src, plic_irq_callbac
int kpu_run(kpu_task_t* v_task, dmac_channel_number_t dma_ch, const void *src, void* dest, plic_irq_callback_t callback)
{
if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
if (atomic_cas(&g_kpu_context.kpu_status, 0, 1))
return -1;
memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;
kpu_layer_argument_t* last_layer = &task->layers[task->layers_length-1];
kpu_layer_argument_t* last_layer = &task->layers[task->layers_length - 1];
uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte+1;
uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;
last_layer->dma_parameter.data.send_data_out = 1;
last_layer->interrupt_enabe.data.int_en = 1;
@ -163,14 +164,137 @@ int kpu_run(kpu_task_t* v_task, dmac_channel_number_t dma_ch, const void *src, v
/**
 * Allocate a heap buffer for the output of the task's last layer.
 *
 * The size is the last layer's dma_total_byte + 1, rounded up to a multiple
 * of 8 bytes (the output DMA elsewhere in this file moves 64-bit words).
 *
 * @param task  Task whose last layer determines the buffer size.
 * @return      Buffer obtained from malloc(), or NULL if task is NULL, the
 *              task has no layers, or the allocation fails. Release it with
 *              kpu_release_output_buf().
 */
uint8_t *kpu_get_output_buf(kpu_task_t* task)
{
    /* Without this guard an empty task would index layers[-1]. */
    if (task == NULL || task->layers_length == 0)
        return NULL;

    kpu_layer_argument_t* last_layer = &task->layers[task->layers_length - 1];
    size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8 * 8;

    return malloc(output_size);
}
/**
 * Release a buffer obtained from kpu_get_output_buf().
 *
 * @param output_buf  Buffer to free; NULL is accepted and ignored.
 */
void kpu_release_output_buf(uint8_t *output_buf)
{
    /* free(NULL) is defined as a no-op, so no explicit NULL check is needed. */
    free(output_buf);
}
/*
 * Push one layer's configuration to the KPU.
 *
 * Writes the twelve register words of `layer` to kpu->layer_argument_fifo,
 * one assignment per word, in the order listed below. The layer itself is
 * not modified.
 *
 * NOTE(review): the hardware presumably expects exactly this word order;
 * do not reorder the writes without checking the KPU documentation.
 */
static void kpu_send_layer(const kpu_layer_argument_t *layer)
{
/* "interrupt_enabe" is the field's spelling in kpu_layer_argument_t. */
kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
kpu->layer_argument_fifo = layer->image_addr.reg;
kpu->layer_argument_fifo = layer->image_channel_num.reg;
kpu->layer_argument_fifo = layer->image_size.reg;
kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
kpu->layer_argument_fifo = layer->kernel_offset.reg;
kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
kpu->layer_argument_fifo = layer->write_back_cfg.reg;
kpu->layer_argument_fifo = layer->conv_value.reg;
kpu->layer_argument_fifo = layer->conv_value2.reg;
kpu->layer_argument_fifo = layer->dma_parameter.reg;
}
/**
 * Set up the KPU registers and hook the AI interrupt.
 *
 * Writes the interrupt-clear, FIFO-threshold, eight-bit-mode and
 * interrupt-mask registers, then registers `callback` for IRQN_AI_INTERRUPT,
 * sets that interrupt's priority to 1 and enables it.
 *
 * @param eight_bit_mode  Value written to the eight_bit_mode register field.
 * @param callback        Handler registered for IRQN_AI_INTERRUPT.
 * @param userdata        Context pointer passed to plic_irq_register().
 */
void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
{
    /* 7 sets the low three bits; presumably the three interrupt flags
     * (calc_done, cfg_almost_empty, cfg_almost_full) - TODO confirm layout. */
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t)
    {
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1
    };
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t)
    {
        .eight_bit_mode = eight_bit_mode
    };
    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
    {
        .calc_done_int = 0,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1
    };

    /*
     * Install the handler before the interrupt can be delivered: registering
     * first and enabling last closes the window in which the AI interrupt
     * could fire with no (or a stale) handler attached.
     */
    plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata);
    plic_set_priority(IRQN_AI_INTERRUPT, 1);
    plic_irq_enable(IRQN_AI_INTERRUPT);
}
/**
 * Start the DMA transfer that loads a layer's input into the KPU.
 *
 * The byte count is channel_switch_addr * 64 * (i_ch_num + 1), taken from
 * `layer`; it is moved as 64-bit words from `src` to AI_IO_BASE_ADDR.
 *
 * @param layer     Layer describing the input size.
 * @param src       Source buffer.
 * @param dma_ch    DMA channel to program.
 * @param callback  Handler registered on the DMA channel.
 * @param userdata  Context pointer passed to dmac_set_irq().
 */
void kpu_input_dma(kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
{
    const uint64_t total_bytes = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
    /* The transfer width is 64 bits, so the count is in 8-byte units. */
    const uint64_t word_count = total_bytes / 8;

    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch,
                         (void *)src,
                         (void *)(AI_IO_BASE_ADDR),
                         DMAC_ADDR_INCREMENT,
                         DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_16,
                         DMAC_TRANS_WIDTH_64,
                         word_count);
}
/**
 * Run one convolution layer on the KPU.
 *
 * Writes 1 to all three fields of the interrupt-clear register, optionally
 * patches the layer for stride 2, then pushes the layer with
 * kpu_send_layer().
 *
 * @param layer   Layer to run. When stride is 2 its pool_type field is
 *                overwritten with 5 before it is sent.
 * @param stride  Stride; any value other than 2 leaves the layer untouched.
 */
void kpu_conv2d(kpu_layer_argument_t *layer, int stride)
{
    const kpu_config_interrupt_t clear_flags = {
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1
    };

    kpu->interrupt_clear.data = clear_flags;

    if (stride == 2)
    {
        /* NOTE(review): 5 is presumably the pooling mode used to emulate
         * stride 2 - confirm against the KPU documentation. */
        layer->kernel_pool_type_cfg.data.pool_type = 5;
    }

    kpu_send_layer(layer);
}
/**
 * Run one convolution layer and stream its output to memory by DMA.
 *
 * Writes 1 to all three fields of the interrupt-mask register, marks the
 * layer to send its data out, programs `dma_ch` to copy from the KPU output
 * FIFO into `dest`, and finally starts the layer through kpu_conv2d().
 *
 * @param layer     Layer to run; its send_data_out field is set to 1.
 * @param stride    Forwarded to kpu_conv2d().
 * @param dma_ch    DMA channel used for the output transfer.
 * @param dest      Destination buffer; receives (dma_total_byte + 8) / 8
 *                  64-bit words.
 * @param callback  Handler registered on the DMA channel.
 * @param userdata  Context pointer passed to dmac_set_irq().
 */
void kpu_conv2d_output(kpu_layer_argument_t *layer, int stride, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
{
    const kpu_config_interrupt_t mask_flags = {
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1
    };

    kpu->interrupt_mask.data = mask_flags;
    layer->dma_parameter.data.send_data_out = 1;

    /* Arm the output DMA before the layer is pushed to the KPU. */
    sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch,
                         (void *)(&kpu->fifo_data_out),
                         dest,
                         DMAC_ADDR_NOCHANGE,
                         DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_8,
                         DMAC_TRANS_WIDTH_64,
                         (layer->dma_parameter.data.dma_total_byte + 8) / 8);

    kpu_conv2d(layer, stride);
}
/* External routine declared here; kpu_add() below does not call it. */
extern long k_add(const uint8_t *src1, long src1_s, const uint8_t *src2, long src2_b, uint8_t *dest, long dest_s, long bias, size_t len);

/**
 * Add two quantized tensors element by element and re-quantize the result.
 *
 * Each pair of input bytes is converted to a real value with
 * q * scale + bias, the two values are summed, and the sum is converted back
 * with (sum - dest bias) / dest scale, truncated toward zero and clamped to
 * 0..255.
 *
 * The inputs are read linearly (channel, then row, then column). The output
 * is written in 64-byte row units:
 *   - width <= 16: four channels share a row, 16 bytes apart;
 *   - width <= 32: two channels share a row, 32 bytes apart;
 *   - otherwise:   one channel per row, (width + 63) / 64 units per row.
 *
 * All pointers must be non-NULL and `dest` must be large enough for that
 * layout. If any extent is zero or negative the function does nothing.
 *
 * @param src1        First input tensor.
 * @param src1_param  Quantization parameters of src1.
 * @param src2        Second input tensor.
 * @param src2_param  Quantization parameters of src2.
 * @param width       Row width in elements.
 * @param height      Rows per channel.
 * @param channels    Number of channels.
 * @param dest        Output buffer.
 * @param dest_param  Quantization parameters of the output.
 */
void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, int width, int height, int channels, uint8_t *dest, const quantize_param_t *dest_param)
{
    /* A negative extent compared against an unsigned counter would turn
     * into a huge loop bound, so reject non-positive extents up front. */
    if (width <= 0 || height <= 0 || channels <= 0)
        return;

    const quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
    const size_t col_count = (size_t)width;
    const size_t row_count = (size_t)height;
    const size_t channel_count = (size_t)channels;
    size_t row_padding;
    size_t row_group;
    size_t row_length;

    if (width <= 16)
    {
        row_padding = 16;
        row_group = 4;
        row_length = 1;
    }
    else if (width <= 32)
    {
        row_padding = 32;
        row_group = 2;
        row_length = 1;
    }
    else
    {
        row_padding = 64;
        row_group = 1;
        row_length = (col_count + 63) / 64;
    }

    for (size_t oc = 0; oc < channel_count; oc++)
    {
        uint8_t *channel_origin = dest + oc / row_group * row_length * row_count * 64 + oc % row_group * row_padding;

        for (size_t y = 0; y < row_count; y++)
        {
            uint8_t *y_origin = channel_origin + y * row_length * 64;

            for (size_t x = 0; x < col_count; x++)
            {
                const float scaled = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
                uint8_t quantized;

                /* Clamp while still in floating point: converting an
                 * out-of-range, infinite or NaN float to an integer is
                 * undefined behaviour. NaN fails the first test and maps
                 * to 0. */
                if (!(scaled > 0.0f))
                    quantized = 0;
                else if (scaled >= 255.0f)
                    quantized = 0xFF;
                else
                    quantized = (uint8_t)scaled;

                y_origin[x] = quantized;
            }
        }
    }
}