Add kpu functions
parent
8f7723a8b9
commit
655c77623d
|
@ -15,11 +15,12 @@ add_compile_flags(LD
|
|||
# Flags for the BOTH group (presumably applied to C and C++ compilation alike;
# confirm against the add_compile_flags macro definition).
# Exactly one optimisation level is listed: when several -O options are given
# the last one wins, so keeping -Os next to -O3 would only be misleading.
add_compile_flags(BOTH
        -mcmodel=medany
        -fno-common
        -march=rv64imafdc
        -ffunction-sections
        -fdata-sections
        -fstrict-volatile-bitfields
        -fno-zero-initialized-in-bss
        -O3
        -ggdb
        )
|
||||
|
||||
|
|
|
@ -365,4 +365,16 @@ uint8_t *kpu_get_output_buf(kpu_task_t* task);
|
|||
*/
|
||||
void kpu_release_output_buf(uint8_t *output_buf);
|
||||
|
||||
/**
 * @brief       Linear quantization parameters of an 8-bit tensor.
 *
 * kpu_add() converts a raw byte q to its real value as q * scale + bias,
 * and converts back with (real - bias) / scale.
 */
typedef struct _quantize_param
{
    float scale; /* multiplier applied to the raw 8-bit value */
    float bias;  /* offset added after scaling */
} quantize_param_t;
|
||||
|
||||
void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata);
|
||||
void kpu_input_dma(kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata);
|
||||
void kpu_conv2d(kpu_layer_argument_t *layer, int stride);
|
||||
void kpu_conv2d_output(kpu_layer_argument_t *layer, int stride, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata);
|
||||
void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, int width, int height, int channels, uint8_t *dest, const quantize_param_t *dest_param);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#include <sysctl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include "printf.h"
|
||||
#include "dmac.h"
|
||||
#include <string.h>
|
||||
|
@ -32,18 +33,18 @@ int kpu_continue(void* _task)
|
|||
int layer_burst_size = 1;
|
||||
kpu->interrupt_clear.data = (kpu_config_interrupt_t)
|
||||
{
|
||||
.calc_done_int=1,
|
||||
.layer_cfg_almost_empty_int=1,
|
||||
.layer_cfg_almost_full_int=1
|
||||
.calc_done_int = 1,
|
||||
.layer_cfg_almost_empty_int = 1,
|
||||
.layer_cfg_almost_full_int = 1
|
||||
};
|
||||
|
||||
if(task->remain_layers_length == 0)
|
||||
if (task->remain_layers_length == 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
if(task->remain_layers_length <= layer_burst_size)
|
||||
if (task->remain_layers_length <= layer_burst_size)
|
||||
{
|
||||
for(uint32_t i=0; i<task->remain_layers_length; i++)
|
||||
for (uint32_t i = 0; i < task->remain_layers_length; i++)
|
||||
{
|
||||
kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
||||
kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
||||
|
@ -62,7 +63,7 @@ int kpu_continue(void* _task)
|
|||
}
|
||||
else
|
||||
{
|
||||
for(uint32_t i=0; i<layer_burst_size; i++)
|
||||
for (uint32_t i = 0; i < layer_burst_size; i++)
|
||||
{
|
||||
kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
||||
kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
||||
|
@ -88,7 +89,7 @@ static int kpu_run_dma_output(uint32_t dma_ch, void* dst, uint32_t length, plic_
|
|||
sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
|
||||
dmac_set_irq(dma_ch, kpu_run_all_done, _task, 1);
|
||||
dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
|
||||
DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length+7)/8);
|
||||
DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -99,22 +100,22 @@ static int kpu_run_dma_input_done_push_layers(void* _task)
|
|||
dmac->channel[task->dma_ch].intclear = 0xFFFFFFFF;
|
||||
kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t)
|
||||
{
|
||||
.fifo_full_threshold = 10, .fifo_empty_threshold=1
|
||||
.fifo_full_threshold = 10, .fifo_empty_threshold = 1
|
||||
};
|
||||
kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t)
|
||||
{
|
||||
.eight_bit_mode=task->eight_bit_mode
|
||||
.eight_bit_mode = task->eight_bit_mode
|
||||
};
|
||||
|
||||
kpu_layer_argument_t* last_layer = &task->layers[task->layers_length-1];
|
||||
kpu_layer_argument_t* last_layer = &task->layers[task->layers_length - 1];
|
||||
|
||||
kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte+1, kpu_run_all_done, task);
|
||||
kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);
|
||||
|
||||
kpu->interrupt_mask.data = (kpu_config_interrupt_t)
|
||||
{
|
||||
.calc_done_int=0,
|
||||
.layer_cfg_almost_empty_int=0,
|
||||
.layer_cfg_almost_full_int=1
|
||||
.calc_done_int = 0,
|
||||
.layer_cfg_almost_empty_int = 0,
|
||||
.layer_cfg_almost_full_int = 1
|
||||
};
|
||||
kpu_continue(task);
|
||||
return 0;
|
||||
|
@ -124,7 +125,7 @@ static void kpu_run_dma_input(uint32_t dma_ch, const void* src, plic_irq_callbac
|
|||
{
|
||||
kpu_task_t* task = _task;
|
||||
kpu_layer_argument_t* first_layer = &task->layers[0];
|
||||
uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num+1);
|
||||
uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);
|
||||
dmac_set_irq(dma_ch, cb, _task, 1);
|
||||
dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
|
||||
DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
|
||||
|
@ -132,15 +133,15 @@ static void kpu_run_dma_input(uint32_t dma_ch, const void* src, plic_irq_callbac
|
|||
|
||||
int kpu_run(kpu_task_t* v_task, dmac_channel_number_t dma_ch, const void *src, void* dest, plic_irq_callback_t callback)
|
||||
{
|
||||
if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
|
||||
if (atomic_cas(&g_kpu_context.kpu_status, 0, 1))
|
||||
return -1;
|
||||
|
||||
memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
|
||||
kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;
|
||||
|
||||
kpu_layer_argument_t* last_layer = &task->layers[task->layers_length-1];
|
||||
kpu_layer_argument_t* last_layer = &task->layers[task->layers_length - 1];
|
||||
|
||||
uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte+1;
|
||||
uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;
|
||||
|
||||
last_layer->dma_parameter.data.send_data_out = 1;
|
||||
last_layer->interrupt_enabe.data.int_en = 1;
|
||||
|
@ -163,14 +164,137 @@ int kpu_run(kpu_task_t* v_task, dmac_channel_number_t dma_ch, const void *src, v
|
|||
|
||||
/*
 * Allocate a buffer for the output of the task's last layer.
 *
 * The size is the last layer's dma_total_byte + 1, rounded up to a multiple
 * of 8 bytes.
 *
 * Returns the malloc() result, or NULL when task is NULL, the task has no
 * layers, or the allocation fails. Release the buffer with
 * kpu_release_output_buf().
 */
uint8_t *kpu_get_output_buf(kpu_task_t* task)
{
    /* Without this guard an empty task would index layers[-1]. */
    if (task == NULL || task->layers_length == 0)
        return NULL;

    const kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
    size_t payload_bytes = (size_t)last_layer->dma_parameter.data.dma_total_byte + 1;
    size_t output_size = (payload_bytes + 7) / 8 * 8;

    return malloc(output_size);
}
|
||||
|
||||
/*
 * Release a buffer obtained from kpu_get_output_buf().
 * Passing NULL is allowed and does nothing.
 */
void kpu_release_output_buf(uint8_t *output_buf)
{
    /* free(NULL) is a no-op, so no explicit NULL test is required. */
    free(output_buf);
}
|
||||
|
||||
static void kpu_send_layer(const kpu_layer_argument_t *layer)
|
||||
{
|
||||
kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
|
||||
kpu->layer_argument_fifo = layer->image_addr.reg;
|
||||
kpu->layer_argument_fifo = layer->image_channel_num.reg;
|
||||
kpu->layer_argument_fifo = layer->image_size.reg;
|
||||
kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
|
||||
kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
|
||||
kpu->layer_argument_fifo = layer->kernel_offset.reg;
|
||||
kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
|
||||
kpu->layer_argument_fifo = layer->write_back_cfg.reg;
|
||||
kpu->layer_argument_fifo = layer->conv_value.reg;
|
||||
kpu->layer_argument_fifo = layer->conv_value2.reg;
|
||||
kpu->layer_argument_fifo = layer->dma_parameter.reg;
|
||||
}
|
||||
|
||||
/*
 * Configure the KPU and install the handler for the AI interrupt.
 *
 * eight_bit_mode  value written to the eight-bit-mode register
 * callback        handler registered for IRQN_AI_INTERRUPT
 * userdata        context pointer passed along with the handler
 *
 * The handler is registered before the interrupt is given a priority and
 * enabled, so the interrupt cannot be delivered while no handler is
 * installed.
 */
void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
{
    /*
     * 7 sets the three low bits of the clear register (presumably the same
     * three flags kpu_conv2d() clears by name -- confirm against the
     * register layout).
     */
    kpu->interrupt_clear.reg = 7;

    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t)
    {
        .fifo_full_threshold = 10,
        .fifo_empty_threshold = 1
    };

    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t)
    {
        .eight_bit_mode = eight_bit_mode
    };

    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
    {
        .calc_done_int = 0,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1
    };

    /* Install the handler first, then make the interrupt deliverable. */
    plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata);
    plic_set_priority(IRQN_AI_INTERRUPT, 1);
    plic_irq_enable(IRQN_AI_INTERRUPT);
}
|
||||
|
||||
/*
 * Program a DMA transfer that copies a layer's input data from src to
 * AI_IO_BASE_ADDR.
 *
 * The byte count is channel_switch_addr * 64 * (i_ch_num + 1), taken from
 * the layer arguments; the transfer moves 64-bit items, hence the division
 * by 8. callback/userdata are registered on the DMA channel before the
 * transfer is set up.
 */
void kpu_input_dma(kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
{
    uint64_t total_bytes = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);

    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch,
                         (void *)src,
                         (void *)(AI_IO_BASE_ADDR),
                         DMAC_ADDR_INCREMENT,
                         DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_16,
                         DMAC_TRANS_WIDTH_64,
                         total_bytes / 8);
}
|
||||
|
||||
/*
 * Clear the KPU interrupt flags and send one layer to the KPU.
 *
 * When stride is 2 the layer's pool_type field is overwritten with 5 before
 * the layer is sent (NOTE(review): the meaning of pool type 5 is not visible
 * here -- confirm against the hardware documentation). Any other stride
 * leaves the layer untouched.
 */
void kpu_conv2d(kpu_layer_argument_t *layer, int stride)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t)
    {
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1
    };

    if (stride == 2)
    {
        layer->kernel_pool_type_cfg.data.pool_type = 5;
    }

    kpu_send_layer(layer);
}
|
||||
|
||||
/*
 * Run one layer through kpu_conv2d() and collect its output by DMA.
 *
 * Sets all three KPU interrupt-mask flags, marks the layer as sending its
 * data out, routes the DMA channel to the AI receive request, registers
 * callback/userdata on that channel and programs a transfer from the KPU
 * output FIFO to dest. The transfer length is dma_total_byte + 1 bytes,
 * rounded up to whole 64-bit items. The layer is pushed to the KPU only
 * after the DMA has been set up.
 */
void kpu_conv2d_output(kpu_layer_argument_t *layer, int stride, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
{
    kpu->interrupt_mask.data = (kpu_config_interrupt_t)
    {
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1
    };

    layer->dma_parameter.data.send_data_out = 1;

    sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch,
                         (void *)(&kpu->fifo_data_out),
                         dest,
                         DMAC_ADDR_NOCHANGE,
                         DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_8,
                         DMAC_TRANS_WIDTH_64,
                         (layer->dma_parameter.data.dma_total_byte + 8) / 8);

    kpu_conv2d(layer, stride);
}
|
||||
|
||||
extern long k_add(const uint8_t *src1, long src1_s, const uint8_t *src2, long src2_b, uint8_t *dest, long dest_s, long bias, size_t len);
|
||||
|
||||
void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, int width, int height, int channels, uint8_t *dest, const quantize_param_t *dest_param)
|
||||
{
|
||||
quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
|
||||
size_t oc, y ,x;
|
||||
|
||||
uint32_t row_padding;
|
||||
uint32_t row_group;
|
||||
uint32_t row_length;
|
||||
|
||||
if (width <= 16)
|
||||
{
|
||||
row_padding = 16;
|
||||
row_group = 4;
|
||||
row_length = 1;
|
||||
}
|
||||
else if (width <= 32)
|
||||
{
|
||||
row_padding = 32;
|
||||
row_group = 2;
|
||||
row_length = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
row_padding = 64;
|
||||
row_group = 1;
|
||||
row_length = (width + 63) / 64;
|
||||
}
|
||||
|
||||
for (oc = 0; oc < channels; oc++)
|
||||
{
|
||||
uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
|
||||
for (y = 0; y < height; y++)
|
||||
{
|
||||
uint8_t *y_origin = channel_origin + y * row_length * 64;
|
||||
for (x = 0; x < width; x++)
|
||||
{
|
||||
int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
|
||||
if (value < 0) value = 0;
|
||||
if (value > 0xFF) value = 0xFF;
|
||||
y_origin[x] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue