Fix rounding in scaling routines.

* before: https://screenshot.googleplex.com/3ujxU7drx8J9aVv
* after: https://screenshot.googleplex.com/5twPmxuBUKjvVD9
* source: https://screenshot.googleplex.com/9yevVP7URe3XfSm

Bug: b/465721312
Change-Id: I2aede005db252b2912ceef23379463f176675205
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7813417
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
Sergey Silkin 2026-05-04 16:26:14 +00:00 committed by Frank Barchard
parent 125f151316
commit 37a848135b
5 changed files with 268 additions and 68 deletions

View File

@ -815,8 +815,9 @@ static void ScaleAddCols2_C(int dst_width,
boxwidth = MIN1((x >> 16) - ix);
int scaletbl_index = boxwidth - minboxwidth;
assert((scaletbl_index == 0) || (scaletbl_index == 1));
*dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + ix) *
scaletbl[scaletbl_index] >>
*dst_ptr++ = (uint8_t)((SumPixels(boxwidth, src_ptr + ix) *
scaletbl[scaletbl_index] +
32768) >>
16);
}
}
@ -840,7 +841,9 @@ static void ScaleAddCols2_16_C(int dst_width,
int scaletbl_index = boxwidth - minboxwidth;
assert((scaletbl_index == 0) || (scaletbl_index == 1));
*dst_ptr++ =
SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] >> 16;
(SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] +
32768) >>
16;
}
}
@ -855,7 +858,7 @@ static void ScaleAddCols0_C(int dst_width,
(void)dx;
src_ptr += (x >> 16);
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = (uint8_t)(src_ptr[i] * scaleval >> 16);
*dst_ptr++ = (uint8_t)((src_ptr[i] * scaleval + 32768) >> 16);
}
}
@ -870,7 +873,8 @@ static void ScaleAddCols1_C(int dst_width,
int i;
x >>= 16;
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + x) * scaleval >> 16);
*dst_ptr++ =
(uint8_t)((SumPixels(boxwidth, src_ptr + x) * scaleval + 32768) >> 16);
x += boxwidth;
}
}
@ -885,7 +889,7 @@ static void ScaleAddCols1_16_C(int dst_width,
int scaleval = 65536 / (boxwidth * boxheight);
int i;
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
*dst_ptr++ = (SumPixels_16(boxwidth, src_ptr + x) * scaleval + 32768) >> 16;
x += boxwidth;
}
}

View File

@ -893,23 +893,29 @@ void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
(65536 / 9) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
(65536 / 9) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] +
src_ptr[src_stride * 2 + 7]) *
(65536 / 6) >>
16;
dst_ptr[0] =
(uint8_t)(((src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
(65536 / 9) +
32768) >>
16);
dst_ptr[1] =
(uint8_t)(((src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
(65536 / 9) +
32768) >>
16);
dst_ptr[2] =
(uint8_t)(((src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] +
src_ptr[src_stride * 2 + 7]) *
(65536 / 6) +
32768) >>
16);
src_ptr += 8;
dst_ptr += 3;
}
@ -922,23 +928,31 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
(65536u / 9u) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
(65536u / 9u) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] +
src_ptr[src_stride * 2 + 7]) *
(65536u / 6u) >>
16;
dst_ptr[0] =
(uint16_t)(((src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2]) *
(65536u / 9u) +
32768u) >>
16);
dst_ptr[1] =
(uint16_t)(((src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] +
src_ptr[src_stride * 2 + 5]) *
(65536u / 9u) +
32768u) >>
16);
dst_ptr[2] =
(uint16_t)(((src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] +
src_ptr[src_stride * 2 + 7]) *
(65536u / 6u) +
32768u) >>
16);
src_ptr += 8;
dst_ptr += 3;
}
@ -952,20 +966,23 @@ void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] +
src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) *
(65536 / 6) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) *
(65536 / 6) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7]) *
(65536 / 4) >>
16;
dst_ptr[0] = (uint8_t)(((src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2]) *
(65536 / 6) +
32768) >>
16);
dst_ptr[1] = (uint8_t)(((src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5]) *
(65536 / 6) +
32768) >>
16);
dst_ptr[2] = (uint8_t)(((src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7]) *
(65536 / 4) +
32768) >>
16);
src_ptr += 8;
dst_ptr += 3;
}
@ -979,19 +996,25 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] +
src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) *
(65536u / 6u) >>
16;
(uint16_t)(((src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2]) *
(65536u / 6u) +
32768u) >>
16);
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) *
(65536u / 6u) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7]) *
(65536u / 4u) >>
16;
(uint16_t)(((src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5]) *
(65536u / 6u) +
32768u) >>
16);
dst_ptr[2] =
(uint16_t)(((src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7]) *
(65536u / 4u) +
32768u) >>
16);
src_ptr += 8;
dst_ptr += 3;
}

View File

@ -693,7 +693,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"paddusw %%xmm6,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"paddusw %%xmm0,%%xmm1 \n"
"movdqa %%xmm1,%%xmm6 \n"
"pmulhuw %%xmm5,%%xmm1 \n"
"pmullw %%xmm5,%%xmm6 \n"
"psraw $15,%%xmm6 \n"
"psubw %%xmm6,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movd %%xmm1,(%1) \n"
"psrlq $0x10,%%xmm1 \n"
@ -754,7 +758,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"paddusw %%xmm1,%%xmm7 \n"
"pshufb %%xmm3,%%xmm7 \n"
"paddusw %%xmm7,%%xmm6 \n"
"movdqa %%xmm6,%%xmm7 \n"
"pmulhuw %%xmm4,%%xmm6 \n"
"pmullw %%xmm4,%%xmm7 \n"
"psraw $15,%%xmm7 \n"
"psubw %%xmm7,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movd %%xmm6,(%1) \n"
"psrlq $0x10,%%xmm6 \n"

View File

@ -748,7 +748,11 @@ __declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
pshufb xmm7, xmm3
paddusw xmm6, xmm7
pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
movdqa xmm7, xmm6
pmulhuw xmm6, xmm4
pmullw xmm7, xmm4
psraw xmm7, 15
psubw xmm6, xmm7 // divide by 9,9,6, 9,9,6 with round
packuswb xmm6, xmm6
movd [edx], xmm6 // write 6 pixels
@ -793,7 +797,11 @@ __declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
pshufb xmm0, xmm4
paddusw xmm1, xmm0
pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
movdqa xmm6, xmm1
pmulhuw xmm1, xmm5
pmullw xmm6, xmm5
psraw xmm6, 15
psubw xmm1, xmm6 // divide by 3,3,2, 3,3,2 with round
packuswb xmm1, xmm1
movd [edx], xmm1 // write 6 pixels

View File

@ -15,6 +15,7 @@
#include <time.h>
#include <new>
#include <vector>
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
@ -42,6 +43,11 @@
namespace libyuv {
namespace {
using ::testing::Combine;
using ::testing::Values;
} // namespace
#ifdef ENABLE_ROW_TESTS
#ifdef HAS_SCALEROWDOWN2_SSSE3
TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
@ -533,5 +539,156 @@ TEST_F(LibYUVScaleTest, ScalePlaneVertical_IntStrideOverflow) {
delete[] src;
delete[] dst;
}
struct ScaleTestParams {
std::vector<std::vector<int>> src_data;
std::vector<std::vector<int>> dst_data;
FilterMode filter_mode;
};
class ScalePlaneTest
: public LibYUVScaleTest,
public ::testing::WithParamInterface<std::tuple<ScaleTestParams, bool>> {
};
TEST_P(ScalePlaneTest, Mismatch) {
const ScaleTestParams& params = std::get<0>(GetParam());
bool use_c_only = std::get<1>(GetParam());
const int src_height = params.src_data.size();
const int src_width = params.src_data[0].size();
const int dst_height = params.dst_data.size();
const int dst_width = params.dst_data[0].size();
const int size = src_width * src_height;
align_buffer_page_end(src_pixels, size);
align_buffer_page_end(dst_pixels, dst_width * dst_height);
for (int y = 0; y < src_height; ++y) {
for (int x = 0; x < src_width; ++x) {
src_pixels[y * src_width + x] =
static_cast<uint8_t>(params.src_data[y][x]);
}
}
memset(dst_pixels, 0, dst_width * dst_height);
MaskCpuFlags(use_c_only ? disable_cpu_flags_ : benchmark_cpu_info_);
ScalePlane(src_pixels, src_width, src_width, src_height, dst_pixels,
dst_width, dst_width, dst_height, params.filter_mode);
bool has_mismatch = false;
for (int y = 0; y < dst_height; ++y) {
for (int x = 0; x < dst_width; ++x) {
if (dst_pixels[y * dst_width + x] !=
static_cast<uint8_t>(params.dst_data[y][x])) {
has_mismatch = true;
break;
}
}
if (has_mismatch)
break;
}
EXPECT_FALSE(has_mismatch);
free_aligned_buffer_page_end(dst_pixels);
free_aligned_buffer_page_end(src_pixels);
}
TEST_P(ScalePlaneTest, Mismatch16) {
const ScaleTestParams& params = std::get<0>(GetParam());
bool use_c_only = std::get<1>(GetParam());
const int src_height = params.src_data.size();
const int src_width = params.src_data[0].size();
const int dst_height = params.dst_data.size();
const int dst_width = params.dst_data[0].size();
const int size = src_width * src_height;
align_buffer_page_end(src_pixels, size * 2);
align_buffer_page_end(dst_pixels, dst_width * dst_height * 2);
uint16_t* src_ptr = reinterpret_cast<uint16_t*>(src_pixels);
uint16_t* dst_ptr = reinterpret_cast<uint16_t*>(dst_pixels);
for (int y = 0; y < src_height; ++y) {
for (int x = 0; x < src_width; ++x) {
src_ptr[y * src_width + x] = static_cast<uint16_t>(params.src_data[y][x]);
}
}
memset(dst_ptr, 0, dst_width * dst_height * 2);
MaskCpuFlags(use_c_only ? disable_cpu_flags_ : benchmark_cpu_info_);
ScalePlane_16(src_ptr, src_width, src_width, src_height, dst_ptr, dst_width,
dst_width, dst_height, params.filter_mode);
bool has_mismatch = false;
for (int y = 0; y < dst_height; ++y) {
for (int x = 0; x < dst_width; ++x) {
if (dst_ptr[y * dst_width + x] !=
static_cast<uint16_t>(params.dst_data[y][x])) {
has_mismatch = true;
break;
}
}
if (has_mismatch)
break;
}
EXPECT_FALSE(has_mismatch);
free_aligned_buffer_page_end(dst_pixels);
free_aligned_buffer_page_end(src_pixels);
}
INSTANTIATE_TEST_SUITE_P(
All,
ScalePlaneTest,
Combine(
Values(
// ScaleRowDown2Box
ScaleTestParams{.src_data = {{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3}},
.dst_data = {{2, 2, 2, 2},
{2, 2, 2, 2},
{2, 2, 2, 2},
{2, 2, 2, 2}},
.filter_mode = kFilterBox},
// ScaleRowDown4Box
ScaleTestParams{.src_data = {{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3},
{1, 3, 1, 3, 1, 3, 1, 3}},
.dst_data = {{2, 2}, {2, 2}},
.filter_mode = kFilterBox},
// ScaleRowDown38Box
ScaleTestParams{
.src_data =
std::vector<std::vector<int>>(8, std::vector<int>(8, 1)),
.dst_data =
std::vector<std::vector<int>>(3, std::vector<int>(3, 1)),
.filter_mode = kFilterBox},
// ScaleAddCols1_C (1/3)
ScaleTestParams{
.src_data =
std::vector<std::vector<int>>(6, std::vector<int>(6, 1)),
.dst_data =
std::vector<std::vector<int>>(2, std::vector<int>(2, 1)),
.filter_mode = kFilterBox},
// ScaleAddCols2_C (5/12)
ScaleTestParams{
.src_data =
std::vector<std::vector<int>>(12, std::vector<int>(12, 1)),
.dst_data =
std::vector<std::vector<int>>(5, std::vector<int>(5, 1)),
.filter_mode = kFilterBox}),
Values(false, true)));
} // namespace libyuv