From 37a848135b2b5df6177a6fd42e1d60eb0fa9539d Mon Sep 17 00:00:00 2001 From: Sergey Silkin Date: Mon, 4 May 2026 16:26:14 +0000 Subject: [PATCH] Fix rounding in scaling routines. * before: https://screenshot.googleplex.com/3ujxU7drx8J9aVv * after: https://screenshot.googleplex.com/5twPmxuBUKjvVD9 * source: https://screenshot.googleplex.com/9yevVP7URe3XfSm Bug: b/465721312 Change-Id: I2aede005db252b2912ceef23379463f176675205 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7813417 Reviewed-by: Frank Barchard Reviewed-by: richard winterton --- source/scale.cc | 16 ++-- source/scale_common.cc | 143 ++++++++++++++++++------------- source/scale_gcc.cc | 8 ++ source/scale_win.cc | 12 ++- unit_test/scale_plane_test.cc | 157 ++++++++++++++++++++++++++++++++++ 5 files changed, 268 insertions(+), 68 deletions(-) diff --git a/source/scale.cc b/source/scale.cc index 9c1e9b264..5481a7e6e 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -815,8 +815,9 @@ static void ScaleAddCols2_C(int dst_width, boxwidth = MIN1((x >> 16) - ix); int scaletbl_index = boxwidth - minboxwidth; assert((scaletbl_index == 0) || (scaletbl_index == 1)); - *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + ix) * - scaletbl[scaletbl_index] >> + *dst_ptr++ = (uint8_t)((SumPixels(boxwidth, src_ptr + ix) * + scaletbl[scaletbl_index] + + 32768) >> 16); } } @@ -840,7 +841,9 @@ static void ScaleAddCols2_16_C(int dst_width, int scaletbl_index = boxwidth - minboxwidth; assert((scaletbl_index == 0) || (scaletbl_index == 1)); *dst_ptr++ = - SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] >> 16; + (SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] + + 32768) >> + 16; } } @@ -855,7 +858,7 @@ static void ScaleAddCols0_C(int dst_width, (void)dx; src_ptr += (x >> 16); for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = (uint8_t)(src_ptr[i] * scaleval >> 16); + *dst_ptr++ = (uint8_t)((src_ptr[i] * scaleval + 32768) >> 16); } } @@ -870,7 +873,8 @@ static void ScaleAddCols1_C(int dst_width, int i; x >>= 16; for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + x) * scaleval >> 16); + *dst_ptr++ = + (uint8_t)((SumPixels(boxwidth, src_ptr + x) * scaleval + 32768) >> 16); x += boxwidth; } } @@ -885,7 +889,7 @@ static void ScaleAddCols1_16_C(int dst_width, int scaleval = 65536 / (boxwidth * boxheight); int i; for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; + *dst_ptr++ = (SumPixels_16(boxwidth, src_ptr + x) * scaleval + 32768) >> 16; x += boxwidth; } } diff --git a/source/scale_common.cc b/source/scale_common.cc index 537f030aa..c82fd41dd 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -893,23 +893,29 @@ void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + - src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * - (65536 / 9) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * - (65536 / 9) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] + - src_ptr[src_stride * 2 + 7]) * - (65536 / 6) >> - 16; + dst_ptr[0] = + (uint8_t)(((src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + + src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * + (65536 / 9) + + 32768) >> + 16); + dst_ptr[1] = + (uint8_t)(((src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * + (65536 / 9) + + 32768) >> + 16); + dst_ptr[2] = + (uint8_t)(((src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + + src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] + + src_ptr[src_stride * 2 + 7]) * + (65536 / 6) + + 32768) >> + 16); src_ptr += 8; dst_ptr += 3; } @@ -922,23 +928,31 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + - src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * - (65536u / 9u) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * - (65536u / 9u) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] + - src_ptr[src_stride * 2 + 7]) * - (65536u / 6u) >> - 16; + dst_ptr[0] = + (uint16_t)(((src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + + src_ptr[src_stride * 2 + 1] + + src_ptr[src_stride * 2 + 2]) * + (65536u / 9u) + + 32768u) >> + 16); + dst_ptr[1] = + (uint16_t)(((src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 2 + 4] + + src_ptr[src_stride * 2 + 5]) * + (65536u / 9u) + + 32768u) >> + 16); + dst_ptr[2] = + (uint16_t)(((src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + + src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] + + src_ptr[src_stride * 2 + 7]) * + (65536u / 6u) + + 32768u) >> + 16); src_ptr += 8; dst_ptr += 3; } @@ -952,20 +966,23 @@ void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] + - src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) * - (65536 / 6) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) * - (65536 / 6) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7]) * - (65536 / 4) >> - 16; + dst_ptr[0] = (uint8_t)(((src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2]) * + (65536 / 6) + + 32768) >> + 16); + dst_ptr[1] = (uint8_t)(((src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5]) * + (65536 / 6) + + 32768) >> + 16); + dst_ptr[2] = (uint8_t)(((src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + + src_ptr[src_stride + 7]) * + (65536 / 4) + + 32768) >> + 16); src_ptr += 8; dst_ptr += 3; } @@ -979,19 +996,25 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] + - src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) * - (65536u / 6u) >> - 16; + (uint16_t)(((src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2]) * + (65536u / 6u) + + 32768u) >> + 16); dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) * - (65536u / 6u) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7]) * - (65536u / 4u) >> - 16; + (uint16_t)(((src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5]) * + (65536u / 6u) + + 32768u) >> + 16); + dst_ptr[2] = + (uint16_t)(((src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + + src_ptr[src_stride + 7]) * + (65536u / 4u) + + 32768u) >> + 16); src_ptr += 8; dst_ptr += 3; } diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 6a2524230..b313e854d 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -693,7 +693,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "paddusw %%xmm6,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" "paddusw %%xmm0,%%xmm1 \n" + "movdqa %%xmm1,%%xmm6 \n" "pmulhuw %%xmm5,%%xmm1 \n" + "pmullw %%xmm5,%%xmm6 \n" + "psraw $15,%%xmm6 \n" + "psubw %%xmm6,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" "movd %%xmm1,(%1) \n" "psrlq $0x10,%%xmm1 \n" @@ -754,7 +758,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "paddusw %%xmm1,%%xmm7 \n" "pshufb %%xmm3,%%xmm7 \n" "paddusw %%xmm7,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" "pmulhuw %%xmm4,%%xmm6 \n" + "pmullw %%xmm4,%%xmm7 \n" + "psraw $15,%%xmm7 \n" + "psubw %%xmm7,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movd %%xmm6,(%1) \n" "psrlq $0x10,%%xmm6 \n" diff --git a/source/scale_win.cc b/source/scale_win.cc index 32c0506fa..0e47e9615 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -748,7 +748,11 @@ __declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, pshufb xmm7, xmm3 paddusw xmm6, xmm7 - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + movdqa xmm7, xmm6 + pmulhuw xmm6, xmm4 + pmullw xmm7, xmm4 + psraw xmm7, 15 + psubw xmm6, xmm7 // divide by 9,9,6, 9,9,6 with round packuswb xmm6, xmm6 movd [edx], xmm6 // write 6 pixels @@ -793,7 +797,11 @@ __declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, pshufb xmm0, xmm4 paddusw xmm1, xmm0 - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + movdqa xmm6, xmm1 + pmulhuw xmm1, xmm5 + pmullw xmm6, xmm5 + psraw xmm6, 15 + psubw xmm1, xmm6 // divide by 3,3,2, 3,3,2 with round packuswb xmm1, xmm1 movd [edx], xmm1 // write 6 pixels diff --git a/unit_test/scale_plane_test.cc b/unit_test/scale_plane_test.cc index 979c70aad..c21b831ec 100644 --- a/unit_test/scale_plane_test.cc +++ b/unit_test/scale_plane_test.cc @@ -15,6 +15,7 @@ #include #include +#include #include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" @@ -42,6 +43,11 @@ namespace libyuv { +namespace { +using ::testing::Combine; +using ::testing::Values; +} // namespace + #ifdef ENABLE_ROW_TESTS #ifdef HAS_SCALEROWDOWN2_SSSE3 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { @@ -533,5 +539,156 @@ TEST_F(LibYUVScaleTest, ScalePlaneVertical_IntStrideOverflow) { delete[] src; delete[] dst; } +struct ScaleTestParams { + std::vector> src_data; + std::vector> dst_data; + FilterMode filter_mode; +}; + +class ScalePlaneTest + : public LibYUVScaleTest, + public ::testing::WithParamInterface> { +}; + +TEST_P(ScalePlaneTest, Mismatch) { + const ScaleTestParams& params = std::get<0>(GetParam()); + bool use_c_only = std::get<1>(GetParam()); + const int src_height = params.src_data.size(); + const int src_width = params.src_data[0].size(); + const int dst_height = params.dst_data.size(); + const int dst_width = params.dst_data[0].size(); + const int size = src_width * src_height; + + align_buffer_page_end(src_pixels, size); + align_buffer_page_end(dst_pixels, dst_width * dst_height); + + for (int y = 0; y < src_height; ++y) { + for (int x = 0; x < src_width; ++x) { + src_pixels[y * src_width + x] = + static_cast(params.src_data[y][x]); + } + } + memset(dst_pixels, 0, dst_width * dst_height); + + MaskCpuFlags(use_c_only ? disable_cpu_flags_ : benchmark_cpu_info_); + + ScalePlane(src_pixels, src_width, src_width, src_height, dst_pixels, + dst_width, dst_width, dst_height, params.filter_mode); + + bool has_mismatch = false; + for (int y = 0; y < dst_height; ++y) { + for (int x = 0; x < dst_width; ++x) { + if (dst_pixels[y * dst_width + x] != + static_cast(params.dst_data[y][x])) { + has_mismatch = true; + break; + } + } + if (has_mismatch) + break; + } + EXPECT_FALSE(has_mismatch); + + free_aligned_buffer_page_end(dst_pixels); + free_aligned_buffer_page_end(src_pixels); +} + +TEST_P(ScalePlaneTest, Mismatch16) { + const ScaleTestParams& params = std::get<0>(GetParam()); + bool use_c_only = std::get<1>(GetParam()); + const int src_height = params.src_data.size(); + const int src_width = params.src_data[0].size(); + const int dst_height = params.dst_data.size(); + const int dst_width = params.dst_data[0].size(); + const int size = src_width * src_height; + + align_buffer_page_end(src_pixels, size * 2); + align_buffer_page_end(dst_pixels, dst_width * dst_height * 2); + + uint16_t* src_ptr = reinterpret_cast(src_pixels); + uint16_t* dst_ptr = reinterpret_cast(dst_pixels); + + for (int y = 0; y < src_height; ++y) { + for (int x = 0; x < src_width; ++x) { + src_ptr[y * src_width + x] = static_cast(params.src_data[y][x]); + } + } + memset(dst_ptr, 0, dst_width * dst_height * 2); + + MaskCpuFlags(use_c_only ? disable_cpu_flags_ : benchmark_cpu_info_); + + ScalePlane_16(src_ptr, src_width, src_width, src_height, dst_ptr, dst_width, + dst_width, dst_height, params.filter_mode); + + bool has_mismatch = false; + for (int y = 0; y < dst_height; ++y) { + for (int x = 0; x < dst_width; ++x) { + if (dst_ptr[y * dst_width + x] != + static_cast(params.dst_data[y][x])) { + has_mismatch = true; + break; + } + } + if (has_mismatch) + break; + } + EXPECT_FALSE(has_mismatch); + + free_aligned_buffer_page_end(dst_pixels); + free_aligned_buffer_page_end(src_pixels); +} + +INSTANTIATE_TEST_SUITE_P( + All, + ScalePlaneTest, + Combine( + Values( + // ScaleRowDown2Box + ScaleTestParams{.src_data = {{1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}}, + .dst_data = {{2, 2, 2, 2}, + {2, 2, 2, 2}, + {2, 2, 2, 2}, + {2, 2, 2, 2}}, + .filter_mode = kFilterBox}, + // ScaleRowDown4Box + ScaleTestParams{.src_data = {{1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}, + {1, 3, 1, 3, 1, 3, 1, 3}}, + .dst_data = {{2, 2}, {2, 2}}, + .filter_mode = kFilterBox}, + // ScaleRowDown38Box + ScaleTestParams{ + .src_data = + std::vector>(8, std::vector(8, 1)), + .dst_data = + std::vector>(3, std::vector(3, 1)), + .filter_mode = kFilterBox}, + // ScaleAddCols1_C (1/3) + ScaleTestParams{ + .src_data = + std::vector>(6, std::vector(6, 1)), + .dst_data = + std::vector>(2, std::vector(2, 1)), + .filter_mode = kFilterBox}, + // ScaleAddCols2_C (5/12) + ScaleTestParams{ + .src_data = + std::vector>(12, std::vector(12, 1)), + .dst_data = + std::vector>(5, std::vector(5, 1)), + .filter_mode = kFilterBox}), + Values(false, true))); } // namespace libyuv