Rounding added to scaling upsampler

Bug: libyuv:872, b/178521093
Change-Id: I86749f73f5e55d5fd8b87ea6938084cbacb1cda7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2686945
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2021-02-10 00:22:41 -08:00 committed by Frank Barchard
parent 742791f13a
commit 12a4a2372c
11 changed files with 251 additions and 214 deletions

View File

@ -710,8 +710,8 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
const PTYPE* sb = src_ptr + src_stride; \ const PTYPE* sb = src_ptr + src_stride; \
PTYPE* da = dst_ptr; \ PTYPE* da = dst_ptr; \
PTYPE* db = dst_ptr + dst_stride; \ PTYPE* db = dst_ptr + dst_stride; \
da[0] = (3 * sa[0] + sb[0]) >> 2; \ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
db[0] = (sa[0] + 3 * sb[0]) >> 2; \ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
if (work_width > 0) { \ if (work_width > 0) { \
if (n != 0) { \ if (n != 0) { \
SIMD(sa, sb - sa, da + 1, db - da, n); \ SIMD(sa, sb - sa, da + 1, db - da, n); \
@ -719,9 +719,9 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
} \ } \
da[dst_width - 1] = \ da[dst_width - 1] = \
(3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2]) >> 2; \ (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \
db[dst_width - 1] = \ db[dst_width - 1] = \
(sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2]) >> 2; \ (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \
} }
SU2BLANY(ScaleRowUp2_Bilinear_Any_C, SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
@ -856,10 +856,10 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
const PTYPE* sb = src_ptr + src_stride; \ const PTYPE* sb = src_ptr + src_stride; \
PTYPE* da = dst_ptr; \ PTYPE* da = dst_ptr; \
PTYPE* db = dst_ptr + dst_stride; \ PTYPE* db = dst_ptr + dst_stride; \
da[0] = (3 * sa[0] + sb[0]) >> 2; \ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
db[0] = (sa[0] + 3 * sb[0]) >> 2; \ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
da[1] = (3 * sa[1] + sb[1]) >> 2; \ da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \
db[1] = (sa[1] + 3 * sb[1]) >> 2; \ db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \
if (work_width > 0) { \ if (work_width > 0) { \
if (n != 0) { \ if (n != 0) { \
SIMD(sa, sb - sa, da + 2, db - da, n); \ SIMD(sa, sb - sa, da + 2, db - da, n); \
@ -867,13 +867,17 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \
} \ } \
da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \
sb[((dst_width + 1) & ~1) - 2]) >> 2; \ sb[((dst_width + 1) & ~1) - 2] + 2) >> \
2; \
db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \
3 * sb[((dst_width + 1) & ~1) - 2]) >> 2; \ 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \
2; \
da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \
sb[((dst_width + 1) & ~1) - 1]) >> 2; \ sb[((dst_width + 1) & ~1) - 1] + 2) >> \
2; \
db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \
3 * sb[((dst_width + 1) & ~1) - 1]) >> 2; \ 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \
2; \
} }
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,

View File

@ -1232,21 +1232,29 @@ void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
assert((dst_width % 2 == 0) && (dst_width >= 0)); assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) { for (x = 0; x < src_width; ++x) {
d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 1 + 8) >> 4; t[2 * x + 2] * 1 + 8) >>
4;
d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 1 + 8) >> 4; t[2 * x + 3] * 1 + 8) >>
4;
d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
t[2 * x + 2] * 3 + 8) >> 4; t[2 * x + 2] * 3 + 8) >>
4;
d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
t[2 * x + 3] * 3 + 8) >> 4; t[2 * x + 3] * 3 + 8) >>
4;
e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
t[2 * x + 2] * 3 + 8) >> 4; t[2 * x + 2] * 3 + 8) >>
4;
e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
t[2 * x + 3] * 3 + 8) >> 4; t[2 * x + 3] * 3 + 8) >>
4;
e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 9 + 8) >> 4; t[2 * x + 2] * 9 + 8) >>
4;
e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 9 + 8) >> 4; t[2 * x + 3] * 9 + 8) >>
4;
} }
} }

View File

@ -196,8 +196,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x20(%0),%%ymm1 \n"
@ -483,8 +482,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf1), // %1 "m"(kShuf1), // %1
"m"(kShuf2) // %2 "m"(kShuf2) // %2
); );
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n" "movdqu 0x10(%0),%%xmm2 \n"
@ -529,8 +527,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kMadd11), // %1 "m"(kMadd11), // %1
"m"(kRound34) // %2 "m"(kRound34) // %2
); );
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -568,8 +565,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3 : "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4 "m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm7"); "xmm6", "xmm7");
} }
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
@ -595,8 +592,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2 "m"(kRound34) // %2
); );
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -637,8 +633,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3 : "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4 "m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm7"); "xmm6", "xmm7");
} }
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
@ -687,8 +683,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb2), // %2 "m"(kShufAb2), // %2
"m"(kScaleAb2) // %3 "m"(kScaleAb2) // %3
); );
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n"
@ -713,7 +708,8 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3 : "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6");
} }
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
@ -730,8 +726,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAc3), // %1 "m"(kShufAc3), // %1
"m"(kScaleAc33) // %2 "m"(kScaleAc33) // %2
); );
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n"
@ -775,8 +770,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3 : "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm7"); "xmm6", "xmm7");
} }
#ifdef HAS_SCALEROWUP2LINEAR_SSE2 #ifdef HAS_SCALEROWUP2LINEAR_SSE2
@ -1601,8 +1596,7 @@ void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int src_width) { int src_width) {
asm volatile( asm volatile("pxor %%xmm5,%%xmm5 \n"
"pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop. // 16 pixel loop.
LABELALIGN LABELALIGN
@ -1633,8 +1627,7 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
void ScaleAddRow_AVX2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int src_width) { int src_width) {
asm volatile( asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
@ -1772,8 +1765,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) { int dx) {
(void)x; (void)x;
(void)dx; (void)dx;
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%1),%%xmm0 \n" "movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
@ -1798,8 +1790,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -1821,8 +1812,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -1846,8 +1836,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -2032,8 +2021,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) { int dx) {
(void)x; (void)x;
(void)dx; (void)dx;
asm volatile( asm volatile(LABELALIGN
LABELALIGN
"1: \n" "1: \n"
"movdqu (%1),%%xmm0 \n" "movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"

View File

@ -194,21 +194,21 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"vmlal.u8 q10, d2, d24 \n" "vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n" "vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2 // (3 * line_0 + line_1 + 2) >> 2
"vqrshrn.u16 d0, q8, #2 \n" "vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n" "vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n" "vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n" "vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2 // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
"vmovl.u8 q8, d1 \n" "vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n" "vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n" "vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1 // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
"vrhadd.u8 d1, d1, d2 \n" "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2 // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
"vmovl.u8 q8, d2 \n" "vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n" "vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n" "vqrshrn.u16 d2, q8, #2 \n"
@ -240,15 +240,15 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"vrhadd.u8 q0, q0, q2 \n" "vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n" "vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2 // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
"vmovl.u8 q3, d1 \n" "vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n" "vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n" "vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1 // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
"vrhadd.u8 d1, d1, d2 \n" "vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2 // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
"vmovl.u8 q3, d2 \n" "vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n" "vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n" "vqrshrn.u16 d2, q3, #2 \n"

View File

@ -201,22 +201,22 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"umlal v19.8h, v3.8b, v20.8b \n" "umlal v19.8h, v3.8b, v20.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// (3 * line_0 + line_1) >> 2 // (3 * line_0 + line_1 + 2) >> 2
"uqrshrn v0.8b, v16.8h, #2 \n" "uqrshrn v0.8b, v16.8h, #2 \n"
"uqrshrn v1.8b, v17.8h, #2 \n" "uqrshrn v1.8b, v17.8h, #2 \n"
"uqrshrn v2.8b, v18.8h, #2 \n" "uqrshrn v2.8b, v18.8h, #2 \n"
"uqrshrn v3.8b, v19.8h, #2 \n" "uqrshrn v3.8b, v19.8h, #2 \n"
"prfm pldl1keep, [%3, 448] \n" "prfm pldl1keep, [%3, 448] \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2 // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
"ushll v16.8h, v1.8b, #0 \n" "ushll v16.8h, v1.8b, #0 \n"
"umlal v16.8h, v0.8b, v20.8b \n" "umlal v16.8h, v0.8b, v20.8b \n"
"uqrshrn v0.8b, v16.8h, #2 \n" "uqrshrn v0.8b, v16.8h, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1 // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
"urhadd v1.8b, v1.8b, v2.8b \n" "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2 // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
"ushll v16.8h, v2.8b, #0 \n" "ushll v16.8h, v2.8b, #0 \n"
"umlal v16.8h, v3.8b, v20.8b \n" "umlal v16.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v16.8h, #2 \n" "uqrshrn v2.8b, v16.8h, #2 \n"
@ -251,16 +251,16 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"urhadd v3.8b, v3.8b, v7.8b \n" "urhadd v3.8b, v3.8b, v7.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// a0 = (src[0] * 3 + s[1] * 1) >> 2 // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
"ushll v4.8h, v1.8b, #0 \n" "ushll v4.8h, v1.8b, #0 \n"
"umlal v4.8h, v0.8b, v20.8b \n" "umlal v4.8h, v0.8b, v20.8b \n"
"uqrshrn v0.8b, v4.8h, #2 \n" "uqrshrn v0.8b, v4.8h, #2 \n"
"prfm pldl1keep, [%3, 448] \n" "prfm pldl1keep, [%3, 448] \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1 // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
"urhadd v1.8b, v1.8b, v2.8b \n" "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2 // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
"ushll v4.8h, v2.8b, #0 \n" "ushll v4.8h, v2.8b, #0 \n"
"umlal v4.8h, v3.8b, v20.8b \n" "umlal v4.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v4.8h, #2 \n" "uqrshrn v2.8b, v4.8h, #2 \n"

View File

@ -690,8 +690,7 @@ void ScaleUVLinearUp2(int src_width,
#endif #endif
if (dst_height == 1) { if (dst_height == 1) {
ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width);
dst_width);
} else { } else {
dy = FixedDiv(src_height - 1, dst_height - 1); dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1; y = (1 << 15) - 1;

View File

@ -609,9 +609,15 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
// BT.601 limited range. // BT.601 limited range.
TEST_F(LibYUVColorTest, TestFullYUV) { TEST_F(LibYUVColorTest, TestFullYUV) {
int rh[256] = { 0, }; int rh[256] = {
int gh[256] = { 0, }; 0,
int bh[256] = { 0, }; };
int gh[256] = {
0,
};
int bh[256] = {
0,
};
for (int u = 0; u < 256; ++u) { for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) { for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -633,9 +639,15 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
// BT.601 full range. // BT.601 full range.
TEST_F(LibYUVColorTest, TestFullYUVJ) { TEST_F(LibYUVColorTest, TestFullYUVJ) {
int rh[256] = { 0, }; int rh[256] = {
int gh[256] = { 0, }; 0,
int bh[256] = { 0, }; };
int gh[256] = {
0,
};
int bh[256] = {
0,
};
for (int u = 0; u < 256; ++u) { for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) { for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -657,9 +669,15 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
// BT.709 limited range. // BT.709 limited range.
TEST_F(LibYUVColorTest, TestFullYUVH) { TEST_F(LibYUVColorTest, TestFullYUVH) {
int rh[256] = { 0, }; int rh[256] = {
int gh[256] = { 0, }; 0,
int bh[256] = { 0, }; };
int gh[256] = {
0,
};
int bh[256] = {
0,
};
for (int u = 0; u < 256; ++u) { for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) { for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -682,9 +700,15 @@ TEST_F(LibYUVColorTest, TestFullYUVH) {
// BT.709 full range. // BT.709 full range.
TEST_F(LibYUVColorTest, TestFullYUVF) { TEST_F(LibYUVColorTest, TestFullYUVF) {
int rh[256] = { 0, }; int rh[256] = {
int gh[256] = { 0, }; 0,
int bh[256] = { 0, }; };
int gh[256] = {
0,
};
int bh[256] = {
0,
};
for (int u = 0; u < 256; ++u) { for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) { for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -706,9 +730,15 @@ TEST_F(LibYUVColorTest, TestFullYUVF) {
// BT.2020 limited range. // BT.2020 limited range.
TEST_F(LibYUVColorTest, TestFullYUVU) { TEST_F(LibYUVColorTest, TestFullYUVU) {
int rh[256] = { 0, }; int rh[256] = {
int gh[256] = { 0, }; 0,
int bh[256] = { 0, }; };
int gh[256] = {
0,
};
int bh[256] = {
0,
};
for (int u = 0; u < 256; ++u) { for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) { for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -731,9 +761,15 @@ TEST_F(LibYUVColorTest, TestFullYUVU) {
// BT.2020 full range. // BT.2020 full range.
TEST_F(LibYUVColorTest, TestFullYUVV) { TEST_F(LibYUVColorTest, TestFullYUVV) {
int rh[256] = { 0, }; int rh[256] = {
int gh[256] = { 0, }; 0,
int bh[256] = { 0, }; };
int gh[256] = {
0,
};
int bh[256] = {
0,
};
for (int u = 0; u < 256; ++u) { for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) { for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {

View File

@ -18,11 +18,15 @@
// For those MCs that can be represented as kr and kb: // For those MCs that can be represented as kr and kb:
// Full range // Full range
// float M[3][3] {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}}; // float M[3][3]
// float B[3] {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255}; // {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
// float B[3]
// {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
// Limited range // Limited range
// float M[3][3] {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}}; // float M[3][3]
// float B[3] {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785}; // {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
// float B[3]
// {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
// mc bt // mc bt
// 1 bt.709 KR = 0.2126; KB = 0.0722 // 1 bt.709 KR = 0.2126; KB = 0.0722
@ -60,7 +64,6 @@ int round(float v) {
} }
int main(int argc, const char* argv[]) { int main(int argc, const char* argv[]) {
if (argc < 2) { if (argc < 2) {
printf("color kr kb\n"); printf("color kr kb\n");
return -1; return -1;
@ -115,4 +118,3 @@ int main(int argc, const char* argv[]) {
return 0; return 0;
} }