From a95da9ee1d76702d0e5f43f24339b964937a30e9 Mon Sep 17 00:00:00 2001 From: Jorge Rodriguez Date: Sat, 26 Jul 2014 12:04:39 -0700 Subject: [PATCH] Unroll the multiply-add loops. At the cost of a function pointer dereference we get a whole lot of conditionals eliminated. Should be a solid win once the debug asserts are gone. --- stb_resample.h | 114 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 90 insertions(+), 24 deletions(-) diff --git a/stb_resample.h b/stb_resample.h index b204365..f706b52 100644 --- a/stb_resample.h +++ b/stb_resample.h @@ -490,9 +490,84 @@ static float* stbr__add_empty_ring_buffer_entry(stbr__info* stbr_info, int n) return ring_buffer; } +typedef void(*stbr__output_decode_coefficients)(float* output_buffer, int out_texel_index, float* decode_buffer, int decode_texel_index, int channels, float coefficient); /* Shared signature of the multiply-add helpers: each adds input texel times coefficient into the output texel. */ + +static void stbr__output_decode_coefficients_1(float* output_buffer, int out_texel_index, float* input_buffer, int input_texel_index, int channels, float coefficient) /* One-channel case: a single multiply-add, no loop. */ +{ + STBR_DEBUG_ASSERT(channels == 1); /* channels is read only by this assert in this variant. */ + + output_buffer[out_texel_index] += input_buffer[input_texel_index] * coefficient; + + STBR_DEBUG_ASSERT(output_buffer[out_texel_index] <= 1.0f); /* A value above 1 would indicate that the kernel sum for this texel does not add to 1. */ +} + +static void stbr__output_decode_coefficients_2(float* output_buffer, int out_texel_index, float* input_buffer, int input_texel_index, int channels, float coefficient) /* Two-channel case, unrolled. */ +{ + STBR_DEBUG_ASSERT(channels == 2); + + output_buffer[out_texel_index ] += input_buffer[input_texel_index ] * coefficient; + output_buffer[out_texel_index + 1] += input_buffer[input_texel_index + 1] * coefficient; + + STBR_DEBUG_ASSERT(output_buffer[out_texel_index ] <= 1.0f); + STBR_DEBUG_ASSERT(output_buffer[out_texel_index+1] <= 1.0f); +} + +static void stbr__output_decode_coefficients_3(float* output_buffer, int out_texel_index, float* input_buffer, int input_texel_index, int channels, float coefficient) /* Three-channel case, unrolled. */ +{ + STBR_DEBUG_ASSERT(channels == 3); + + output_buffer[out_texel_index ] += 
input_buffer[input_texel_index ] * coefficient; + output_buffer[out_texel_index + 1] += input_buffer[input_texel_index + 1] * coefficient; + output_buffer[out_texel_index + 2] += input_buffer[input_texel_index + 2] * coefficient; + + STBR_DEBUG_ASSERT(output_buffer[out_texel_index ] <= 1.0f); + STBR_DEBUG_ASSERT(output_buffer[out_texel_index + 1] <= 1.0f); + STBR_DEBUG_ASSERT(output_buffer[out_texel_index + 2] <= 1.0f); +} + +static void stbr__output_decode_coefficients_4(float* output_buffer, int out_texel_index, float* input_buffer, int input_texel_index, int channels, float coefficient) /* Four-channel case, unrolled. */ +{ + STBR_DEBUG_ASSERT(channels == 4); + + output_buffer[out_texel_index ] += input_buffer[input_texel_index ] * coefficient; + output_buffer[out_texel_index + 1] += input_buffer[input_texel_index + 1] * coefficient; + output_buffer[out_texel_index + 2] += input_buffer[input_texel_index + 2] * coefficient; + output_buffer[out_texel_index + 3] += input_buffer[input_texel_index + 3] * coefficient; + + STBR_DEBUG_ASSERT(output_buffer[out_texel_index ] <= 1.0f); + STBR_DEBUG_ASSERT(output_buffer[out_texel_index + 1] <= 1.0f); + STBR_DEBUG_ASSERT(output_buffer[out_texel_index + 2] <= 1.0f); + STBR_DEBUG_ASSERT(output_buffer[out_texel_index + 3] <= 1.0f); +} + +static void stbr__output_decode_coefficients_n(float* output_buffer, int out_texel_index, float* input_buffer, int input_texel_index, int channels, float coefficient) /* General case: loops over however many channels are passed. */ +{ + int c; + for (c = 0; c < channels; c++) + { + output_buffer[out_texel_index + c] += input_buffer[input_texel_index + c] * coefficient; + + STBR_DEBUG_ASSERT(output_buffer[out_texel_index + c] <= 1.0f); /* Same bound as the unrolled variants, checked right after each channel is accumulated. */ + } +} + +static stbr__output_decode_coefficients stbr__get_output_decode_coefficients_function(int channels) /* Returns the unrolled helper for 1 to 4 channels and the looping helper otherwise. */ +{ + if (channels == 1) + return &stbr__output_decode_coefficients_1; + else if (channels == 2) + return &stbr__output_decode_coefficients_2; + else if (channels == 3) + return &stbr__output_decode_coefficients_3; + else if (channels == 4) + return 
&stbr__output_decode_coefficients_4; + + return &stbr__output_decode_coefficients_n; /* Reached for every channel count other than 1 to 4. */ +} + static void stbr__resample_horizontal_upsample(stbr__info* stbr_info, int n, float* output_buffer) { - int x, k, c; + int x, k; int output_w = stbr_info->output_w; int kernel_texel_width = stbr__get_filter_texel_width(stbr_info->filter); int channels = stbr_info->channels; @@ -500,6 +575,8 @@ static void stbr__resample_horizontal_upsample(stbr__info* stbr_info, int n, flo stbr__contributors* horizontal_contributors = stbr_info->horizontal_contributors; float* horizontal_coefficients = stbr_info->horizontal_coefficients; + stbr__output_decode_coefficients output_decode_coefficients_fn = stbr__get_output_decode_coefficients_function(channels); /* Helper is chosen once per call, ahead of the texel loop. */ + for (x = 0; x < output_w; x++) { int n0 = horizontal_contributors[x].n0; @@ -521,19 +598,14 @@ static void stbr__resample_horizontal_upsample(stbr__info* stbr_info, int n, flo int in_texel_index = k * channels; float coefficient = horizontal_coefficients[coefficient_index]; - for (c = 0; c < channels; c++) - { - output_buffer[out_texel_index + c] += decode_buffer[in_texel_index + c] * coefficient; - - STBR_DEBUG_ASSERT(output_buffer[out_texel_index + c] <= 1.0f); - } + output_decode_coefficients_fn(output_buffer, out_texel_index, decode_buffer, in_texel_index, channels, coefficient); } } } static void stbr__resample_horizontal_downsample(stbr__info* stbr_info, int n, float* output_buffer) { - int x, k, c; + int x, k; int input_w = stbr_info->input_w; int output_w = stbr_info->output_w; int kernel_texel_width = stbr__get_filter_texel_width(stbr_info->filter); @@ -544,6 +616,8 @@ static void stbr__resample_horizontal_downsample(stbr__info* stbr_info, int n, f int filter_texel_margin = stbr__get_filter_texel_margin(stbr_info->filter); int max_x = input_w + filter_texel_margin * 2; + stbr__output_decode_coefficients output_decode_coefficients_fn = stbr__get_output_decode_coefficients_function(channels); + 
STBR_DEBUG_ASSERT(!stbr__use_width_upsampling(stbr_info)); for (x = 0; x < max_x; x++) @@ -565,12 +639,7 @@ static void stbr__resample_horizontal_downsample(stbr__info* stbr_info, int n, f int out_texel_index = k * channels; float coefficient = horizontal_coefficients[coefficient_index]; - for (c = 0; c < channels; c++) - { - output_buffer[out_texel_index + c] += decode_buffer[in_texel_index + c] * coefficient; - - STBR_DEBUG_ASSERT(output_buffer[out_texel_index + c] <= 1.0); // This would indicate that the sum of kernels for this texel doesn't add to 1. - } + output_decode_coefficients_fn(output_buffer, out_texel_index, decode_buffer, in_texel_index, channels, coefficient); } } } @@ -638,6 +707,8 @@ static void stbr__resample_vertical_upsample(stbr__info* stbr_info, int n, int i int output_row_index = n * stbr_info->output_stride_bytes; + stbr__output_decode_coefficients output_decode_coefficients_fn = stbr__get_output_decode_coefficients_function(channels); + STBR_DEBUG_ASSERT(stbr__use_height_upsampling(stbr_info)); STBR_DEBUG_ASSERT(n0 >= in_first_scanline); STBR_DEBUG_ASSERT(n1 <= in_last_scanline); @@ -658,8 +729,7 @@ static void stbr__resample_vertical_upsample(stbr__info* stbr_info, int n, int i float* ring_buffer_entry = stbr__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, kernel_texel_width, ring_buffer_length); float coefficient = vertical_coefficients[coefficient_index]; - for (c = 0; c < channels; c++) - encode_buffer[c] += ring_buffer_entry[in_texel_index + c] * coefficient; + output_decode_coefficients_fn(encode_buffer, 0, ring_buffer_entry, in_texel_index, channels, coefficient); /* Unlike the loop it replaces, the helper also applies its STBR_DEBUG_ASSERT bound check to encode_buffer. */ } for (c = 0; c < channels; c++) @@ -669,7 +739,7 @@ static void stbr__resample_vertical_upsample(stbr__info* stbr_info, int n, int i static void stbr__resample_vertical_downsample(stbr__info* stbr_info, int n, int in_first_scanline, int in_last_scanline, float in_center_of_out) { - int x, k, c; + int x, k; int output_w = 
stbr_info->output_w; int output_h = stbr_info->output_h; stbr__contributors* vertical_contributors = &stbr_info->vertical_contributors; @@ -690,6 +760,8 @@ static void stbr__resample_vertical_downsample(stbr__info* stbr_info, int n, int int n0 = vertical_contributors->n0; int n1 = vertical_contributors->n1; + stbr__output_decode_coefficients output_decode_coefficients_fn = stbr__get_output_decode_coefficients_function(channels); + STBR_DEBUG_ASSERT(!stbr__use_height_upsampling(stbr_info)); STBR_DEBUG_ASSERT(n0 >= in_first_scanline); STBR_DEBUG_ASSERT(n1 <= in_last_scanline); @@ -708,13 +780,7 @@ static void stbr__resample_vertical_downsample(stbr__info* stbr_info, int n, int float* ring_buffer_entry = stbr__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, kernel_texel_width, ring_buffer_length); float coefficient = vertical_coefficients[coefficient_index]; - for (c = 0; c < channels; c++) - { - int index = in_texel_index + c; - ring_buffer_entry[index] += horizontal_buffer[index] * coefficient; - - STBR_DEBUG_ASSERT(ring_buffer_entry[index] <= 1.0); // This would indicate that the sum of kernels for this texel doesn't add to 1. - } + output_decode_coefficients_fn(ring_buffer_entry, in_texel_index, horizontal_buffer, in_texel_index, channels, coefficient); /* Destination and source are addressed with the same texel index. */ } } }