diff --git a/source/convert.cc b/source/convert.cc
index 8a4fcf06e..06d312f96 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -15,8 +15,8 @@
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "libyuv/row.h"
-#include "libyuv/scale.h"  // For ScalePlane()
-#include "libyuv/scale_uv.h"  // For UVScale()
+#include "libyuv/scale.h"     // For ScalePlane()
+#include "libyuv/scale_uv.h"  // For UVScale()
 
 #ifdef __cplusplus
 namespace libyuv {
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index f91110034..1aea6db9e 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -4374,15 +4374,15 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
       "lea        8(%1),%1                       \n"
      "subl       $0x8,%5                        \n"
       "jg         1b                             \n"
-      : "+r"(src_argb),             // %0
-        "+r"(dst_r),                // %1
-        "+r"(dst_g),                // %2
-        "+r"(dst_b),                // %3
-        "+r"(dst_a),                // %4
+      : "+r"(src_argb),  // %0
+        "+r"(dst_r),     // %1
+        "+r"(dst_g),     // %2
+        "+r"(dst_b),     // %3
+        "+r"(dst_a),     // %4
 #if defined(__i386__)
-        "+m"(width)                 // %5
+        "+m"(width)  // %5
 #else
-        "+rm"(width)                // %5
+        "+rm"(width)  // %5
 #endif
       : "m"(kShuffleMaskARGBSplit)  // %6
       : "memory", "cc", "xmm0", "xmm1", "xmm2");
@@ -4465,15 +4465,15 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
       "lea        16(%1),%1                      \n"
       "subl       $0x10,%5                       \n"
       "jg         1b                             \n"
-      : "+r"(src_argb),                    // %0
-        "+r"(dst_r),                       // %1
-        "+r"(dst_g),                       // %2
-        "+r"(dst_b),                       // %3
-        "+r"(dst_a),                       // %4
+      : "+r"(src_argb),  // %0
+        "+r"(dst_r),     // %1
+        "+r"(dst_g),     // %2
+        "+r"(dst_b),     // %3
+        "+r"(dst_a),     // %4
 #if defined(__i386__)
-        "+m"(width)                        // %5
+        "+m"(width)  // %5
 #else
-        "+rm"(width)                       // %5
+        "+rm"(width)  // %5
 #endif
       : "m"(kShuffleMaskARGBSplit_AVX2),   // %6
         "m"(kShuffleMaskARGBPermute_AVX2)  // %7
@@ -7186,7 +7186,7 @@ void HalfFloatRow_AVX2(const uint16_t* src,
 #if defined(__x86_64__)
       : "x"(scale)  // %3
 #else
-      : "m"(scale)    // %3
+      : "m"(scale)  // %3
 #endif
       : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
 }
@@ -7224,7 +7224,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
 #if defined(__x86_64__)
       : "x"(scale)  // %3
 #else
-      : "m"(scale)    // %3
+      : "m"(scale)  // %3
 #endif
       : "memory", "cc", "xmm2", "xmm3", "xmm4");
 }
diff --git a/source/scale_any.cc b/source/scale_any.cc
index 4257d17b9..47b283863 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -700,28 +700,28 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
 
 // Scale up 2 times using bilinear filter.
 // This function produces 2 rows at a time.
-#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE)                               \
-  void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr,    \
-            ptrdiff_t dst_stride, int dst_width) {                         \
-    int work_width = (dst_width - 1) & ~1;                                 \
-    int r = work_width & MASK;                                             \
-    int n = work_width & ~MASK;                                            \
-    const PTYPE* sa = src_ptr;                                             \
-    const PTYPE* sb = src_ptr + src_stride;                                \
-    PTYPE* da = dst_ptr;                                                   \
-    PTYPE* db = dst_ptr + dst_stride;                                      \
-    da[0] = (3 * sa[0] + sb[0]) >> 2;                                      \
-    db[0] = (sa[0] + 3 * sb[0]) >> 2;                                      \
-    if (work_width > 0) {                                                  \
-      if (n != 0) {                                                        \
-        SIMD(sa, sb - sa, da + 1, db - da, n);                             \
-      }                                                                    \
-      C(sa + (n / 2), sb - sa, da + n + 1, db - da, r);                    \
-    }                                                                      \
-    da[dst_width - 1] =                                                    \
-        (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2]) >> 2;      \
-    db[dst_width - 1] =                                                    \
-        (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2]) >> 2;      \
+#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE)                               \
+  void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr,    \
+            ptrdiff_t dst_stride, int dst_width) {                         \
+    int work_width = (dst_width - 1) & ~1;                                 \
+    int r = work_width & MASK;                                             \
+    int n = work_width & ~MASK;                                            \
+    const PTYPE* sa = src_ptr;                                             \
+    const PTYPE* sb = src_ptr + src_stride;                                \
+    PTYPE* da = dst_ptr;                                                   \
+    PTYPE* db = dst_ptr + dst_stride;                                      \
+    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                  \
+    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                  \
+    if (work_width > 0) {                                                  \
+      if (n != 0) {                                                        \
+        SIMD(sa, sb - sa, da + 1, db - da, n);                             \
+      }                                                                    \
+      C(sa + (n / 2), sb - sa, da + n + 1, db - da, r);                    \
+    }                                                                      \
+    da[dst_width - 1] =                                                    \
+        (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2;  \
+    db[dst_width - 1] =                                                    \
+        (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2;  \
   }
 
 SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
@@ -856,10 +856,10 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
     const PTYPE* sb = src_ptr + src_stride;                                \
     PTYPE* da = dst_ptr;                                                   \
     PTYPE* db = dst_ptr + dst_stride;                                      \
-    da[0] = (3 * sa[0] + sb[0]) >> 2;                                      \
-    db[0] = (sa[0] + 3 * sb[0]) >> 2;                                      \
-    da[1] = (3 * sa[1] + sb[1]) >> 2;                                      \
-    db[1] = (sa[1] + 3 * sb[1]) >> 2;                                      \
+    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                  \
+    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                  \
+    da[1] = (3 * sa[1] + sb[1] + 2) >> 2;                                  \
+    db[1] = (sa[1] + 3 * sb[1] + 2) >> 2;                                  \
     if (work_width > 0) {                                                  \
       if (n != 0) {                                                        \
         SIMD(sa, sb - sa, da + 2, db - da, n);                             \
@@ -867,13 +867,17 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
       C(sa + n, sb - sa, da + 2 * n + 2, db - da, r);                      \
     }                                                                      \
     da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] +          \
-                             sb[((dst_width + 1) & ~1) - 2]) >> 2;         \
+                             sb[((dst_width + 1) & ~1) - 2] + 2) >>        \
+                            2;                                             \
     db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] +              \
-                             3 * sb[((dst_width + 1) & ~1) - 2]) >> 2;     \
+                             3 * sb[((dst_width + 1) & ~1) - 2] + 2) >>    \
+                            2;                                             \
     da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] +          \
-                             sb[((dst_width + 1) & ~1) - 1]) >> 2;         \
+                             sb[((dst_width + 1) & ~1) - 1] + 2) >>        \
+                            2;                                             \
     db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] +              \
-                             3 * sb[((dst_width + 1) & ~1) - 1]) >> 2;     \
+                             3 * sb[((dst_width + 1) & ~1) - 1] + 2) >>    \
+                            2;                                             \
   }
 
 SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
diff --git a/source/scale_common.cc b/source/scale_common.cc
index 4af843216..f4f233973 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -1232,21 +1232,29 @@ void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
   assert((dst_width % 2 == 0) && (dst_width >= 0));
   for (x = 0; x < src_width; ++x) {
     d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
-                    t[2 * x + 2] * 1 + 8) >> 4;
+                    t[2 * x + 2] * 1 + 8) >>
+                   4;
     d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
-                    t[2 * x + 3] * 1 + 8) >> 4;
+                    t[2 * x + 3] * 1 + 8) >>
+                   4;
     d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
-                    t[2 * x + 2] * 3 + 8) >> 4;
+                    t[2 * x + 2] * 3 + 8) >>
+                   4;
     d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
-                    t[2 * x + 3] * 3 + 8) >> 4;
+                    t[2 * x + 3] * 3 + 8) >>
+                   4;
     e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
-                    t[2 * x + 2] * 3 + 8) >> 4;
+                    t[2 * x + 2] * 3 + 8) >>
+                   4;
     e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
-                    t[2 * x + 3] * 3 + 8) >> 4;
+                    t[2 * x + 3] * 3 + 8) >>
+                   4;
     e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
-                    t[2 * x + 2] * 9 + 8) >> 4;
+                    t[2 * x + 2] * 9 + 8) >>
+                   4;
     e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
-                    t[2 * x + 3] * 9 + 8) >> 4;
+                    t[2 * x + 3] * 9 + 8) >>
+                   4;
   }
 }
 
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index 226e0a956..9563e5bb6 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -196,8 +196,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                         uint8_t* dst_ptr,
                         int dst_width) {
   (void)src_stride;
-  asm volatile(
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1:                                        \n"
       "vmovdqu    (%0),%%ymm0                    \n"
       "vmovdqu    0x20(%0),%%ymm1                \n"
@@ -211,11 +210,11 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
       "sub        $0x20,%2                       \n"
       "jg         1b                             \n"
       "vzeroupper                                \n"
-      : "+r"(src_ptr),   // %0
-        "+r"(dst_ptr),   // %1
-        "+r"(dst_width)  // %2
-        ::"memory",
-        "cc", "xmm0", "xmm1");
+               : "+r"(src_ptr),   // %0
+                 "+r"(dst_ptr),   // %1
+                 "+r"(dst_width)  // %2
+                 ::"memory",
+                 "cc", "xmm0", "xmm1");
 }
 
 void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
@@ -483,8 +482,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
         "m"(kShuf1),  // %1
         "m"(kShuf2)   // %2
       );
-  asm volatile(
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1:                                        \n"
       "movdqu     (%0),%%xmm0                    \n"
       "movdqu     0x10(%0),%%xmm2                \n"
@@ -500,11 +498,11 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
       "lea        0x18(%1),%1                    \n"
       "sub        $0x18,%2                       \n"
       "jg         1b                             \n"
-      : "+r"(src_ptr),   // %0
-        "+r"(dst_ptr),   // %1
-        "+r"(dst_width)  // %2
-        ::"memory",
-        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+               : "+r"(src_ptr),   // %0
+                 "+r"(dst_ptr),   // %1
+                 "+r"(dst_width)  // %2
+                 ::"memory",
+                 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
 void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
@@ -529,8 +527,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
         "m"(kMadd11),  // %1
         "m"(kRound34)  // %2
       );
-  asm volatile(
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1:                                        \n"
       "movdqu     (%0),%%xmm6                    \n"
       "movdqu     0x00(%0,%3,1),%%xmm7           \n"
@@ -563,13 +560,13 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
       "lea        0x18(%1),%1                    \n"
       "sub        $0x18,%2                       \n"
       "jg         1b                             \n"
-      : "+r"(src_ptr),                // %0
-        "+r"(dst_ptr),                // %1
-        "+r"(dst_width)               // %2
-      : "r"((intptr_t)(src_stride)),  // %3
-        "m"(kMadd21)                  // %4
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+               : "+r"(src_ptr),                // %0
+                 "+r"(dst_ptr),                // %1
+                 "+r"(dst_width)               // %2
+               : "r"((intptr_t)(src_stride)),  // %3
+                 "m"(kMadd21)                  // %4
+               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+                 "xmm5", "xmm6", "xmm7");
 }
 
 void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
@@ -595,8 +592,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
         "m"(kRound34)  // %2
       );
 
-  asm volatile(
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1:                                        \n"
       "movdqu     (%0),%%xmm6                    \n"
       "movdqu     0x00(%0,%3,1),%%xmm7           \n"
@@ -632,13 +628,13 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
       "lea        0x18(%1),%1                    \n"
       "sub        $0x18,%2                       \n"
"jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, @@ -687,8 +683,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -709,11 +704,12 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, @@ -730,8 +726,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -771,12 +766,12 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } #ifdef HAS_SCALEROWUP2LINEAR_SSE2 @@ -1601,11 +1596,10 @@ void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" - // 16 pixel loop. - LABELALIGN + // 16 pixel loop. 
+ LABELALIGN "1: \n" "movdqu (%0),%%xmm3 \n" "lea 0x10(%0),%0 \n" // src_ptr += 16 @@ -1621,11 +1615,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEADDROW_AVX2 @@ -1633,10 +1627,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm3 \n" "lea 0x20(%0),%0 \n" // src_ptr += 32 @@ -1651,11 +1644,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 @@ -1772,8 +1765,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1786,11 +1778,11 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, "sub $0x20,%2 \n" "jg 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, @@ -1798,8 +1790,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1809,11 +1800,11 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, @@ -1821,8 +1812,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1835,19 +1825,18 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1864,11 +1853,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : 
"r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. @@ -2032,8 +2021,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -2046,11 +2034,11 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw @@ -2381,7 +2369,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 - "m"(kUVLinearMadd31_SSSE3) // %5 + "m"(kUVLinearMadd31_SSSE3) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } diff --git a/source/scale_neon.cc b/source/scale_neon.cc index fea3e64e1..e65654d92 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -194,21 +194,21 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, "vmlal.u8 q10, d2, d24 \n" "vmlal.u8 q11, d3, d24 \n" - // (3 * line_0 + line_1) >> 2 + // (3 * line_0 + line_1 + 2) >> 2 "vqrshrn.u16 d0, q8, #2 \n" "vqrshrn.u16 d1, q9, #2 \n" "vqrshrn.u16 d2, q10, #2 \n" "vqrshrn.u16 d3, q11, #2 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "vmovl.u8 q8, d1 \n" "vmlal.u8 q8, d0, d24 \n" "vqrshrn.u16 d0, q8, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "vmovl.u8 q8, d2 \n" "vmlal.u8 q8, d3, d24 \n" "vqrshrn.u16 d2, q8, #2 \n" @@ -240,15 +240,15 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, "vrhadd.u8 q0, q0, q2 \n" "vrhadd.u8 q1, q1, q3 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "vmovl.u8 q3, d1 \n" "vmlal.u8 q3, d0, d24 \n" "vqrshrn.u16 d0, q3, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "vmovl.u8 q3, d2 \n" "vmlal.u8 q3, d3, d24 \n" "vqrshrn.u16 d2, q3, #2 \n" diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 3a3d499dc..03a798cd4 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -201,22 +201,22 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, "umlal v19.8h, v3.8b, v20.8b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - // (3 * line_0 + line_1) >> 2 + // (3 * line_0 + line_1 + 2) >> 2 "uqrshrn v0.8b, v16.8h, #2 \n" "uqrshrn v1.8b, v17.8h, #2 \n" "uqrshrn v2.8b, v18.8h, #2 \n" "uqrshrn v3.8b, v19.8h, #2 \n" "prfm pldl1keep, [%3, 448] \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "ushll v16.8h, v1.8b, #0 \n" "umlal v16.8h, v0.8b, v20.8b \n" "uqrshrn v0.8b, v16.8h, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 
"ushll v16.8h, v2.8b, #0 \n" "umlal v16.8h, v3.8b, v20.8b \n" "uqrshrn v2.8b, v16.8h, #2 \n" @@ -251,16 +251,16 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, "urhadd v3.8b, v3.8b, v7.8b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - // a0 = (src[0] * 3 + s[1] * 1) >> 2 + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "ushll v4.8h, v1.8b, #0 \n" "umlal v4.8h, v0.8b, v20.8b \n" "uqrshrn v0.8b, v4.8h, #2 \n" "prfm pldl1keep, [%3, 448] \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "ushll v4.8h, v2.8b, #0 \n" "umlal v4.8h, v3.8b, v20.8b \n" "uqrshrn v2.8b, v4.8h, #2 \n" diff --git a/source/scale_uv.cc b/source/scale_uv.cc index ab58966d5..003ad2a17 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -690,8 +690,7 @@ void ScaleUVLinearUp2(int src_width, #endif if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, - dst_width); + ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc index 60bdfdd68..a81ab19a8 100644 --- a/unit_test/color_test.cc +++ b/unit_test/color_test.cc @@ -470,7 +470,7 @@ static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) { // BT.2020 full range YUV to RGB reference static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) { - *r = RoundToByte(y + (v - 128) * 1.474600); + *r = RoundToByte(y + (v - 128) * 1.474600); *g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353); *b = RoundToByte(y + (u - 128) * 1.881400); } @@ -609,9 +609,15 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) { // BT.601 limited range. TEST_F(LibYUVColorTest, TestFullYUV) { - int rh[256] = { 0, }; - int gh[256] = { 0, }; - int bh[256] = { 0, }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { @@ -633,9 +639,15 @@ TEST_F(LibYUVColorTest, TestFullYUV) { // BT.601 full range. TEST_F(LibYUVColorTest, TestFullYUVJ) { - int rh[256] = { 0, }; - int gh[256] = { 0, }; - int bh[256] = { 0, }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { @@ -657,9 +669,15 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) { // BT.709 limited range. TEST_F(LibYUVColorTest, TestFullYUVH) { - int rh[256] = { 0, }; - int gh[256] = { 0, }; - int bh[256] = { 0, }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { @@ -682,9 +700,15 @@ TEST_F(LibYUVColorTest, TestFullYUVH) { // BT.709 full range. TEST_F(LibYUVColorTest, TestFullYUVF) { - int rh[256] = { 0, }; - int gh[256] = { 0, }; - int bh[256] = { 0, }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { @@ -706,9 +730,15 @@ TEST_F(LibYUVColorTest, TestFullYUVF) { // BT.2020 limited range. 
 TEST_F(LibYUVColorTest, TestFullYUVU) {
-  int rh[256] = { 0, };
-  int gh[256] = { 0, };
-  int bh[256] = { 0, };
+  int rh[256] = {
+      0,
+  };
+  int gh[256] = {
+      0,
+  };
+  int bh[256] = {
+      0,
+  };
   for (int u = 0; u < 256; ++u) {
     for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -731,9 +761,15 @@
 
 // BT.2020 full range.
 TEST_F(LibYUVColorTest, TestFullYUVV) {
-  int rh[256] = { 0, };
-  int gh[256] = { 0, };
-  int bh[256] = { 0, };
+  int rh[256] = {
+      0,
+  };
+  int gh[256] = {
+      0,
+  };
+  int bh[256] = {
+      0,
+  };
   for (int u = 0; u < 256; ++u) {
     for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index c7c5daffe..18b910e58 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -794,10 +794,10 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
 #define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                         l, m)
-#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
-#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
 #define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
@@ -824,10 +824,10 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
 #define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                         l, m)
-#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
-#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
 #define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
@@ -854,10 +854,10 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
 #define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                         l, m)
-#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
  I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
-#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
 
diff --git a/util/color.cc b/util/color.cc
index 2333276ba..8c3bbefd2 100644
--- a/util/color.cc
+++ b/util/color.cc
@@ -18,11 +18,15 @@
 
 // For those MCs that can be represented as kr and kb:
 // Full range
-// float M[3][3] {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
-// float B[3] {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
+// float M[3][3]
+// {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
+// float B[3]
+// {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
 // Limited range
-// float M[3][3] {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
-// float B[3] {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
+// float M[3][3]
+// {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
+// float B[3]
+// {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
 
 // mc bt
 // 1 bt.709 KR = 0.2126; KB = 0.0722
@@ -56,11 +60,10 @@
 // #define BR (-VR * 128 + YB)
 
 int round(float v) {
-  return (int) (v + 0.5);
+  return (int)(v + 0.5);
 }
 
 int main(int argc, const char* argv[]) {
-
   if (argc < 2) {
     printf("color kr kb\n");
     return -1;
@@ -81,11 +84,11 @@ int main(int argc, const char* argv[]) {
 
   printf("KR = %4f; ", kr);
   printf("KB = %4f\n", kb);
-// printf("KG = %4f\n", kg);
-// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-// #define YB 32 /* 64 / 2 */
-//
-// // U and V contributions to R,G,B.
+  // printf("KG = %4f\n", kg);
+  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+  // #define YB 32 /* 64 / 2 */
+  //
+  // // U and V contributions to R,G,B.
 
   printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
   printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
@@ -102,11 +105,11 @@ int main(int argc, const char* argv[]) {
 
   printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
   printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
-// printf("KG = %4f\n", kg);
-// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-// #define YB 32 /* 64 / 2 */
-//
-// // U and V contributions to R,G,B.
+  // printf("KG = %4f\n", kg);
+  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+  // #define YB 32 /* 64 / 2 */
+  //
+  // // U and V contributions to R,G,B.
 
   printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
   printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
@@ -115,4 +118,3 @@ int main(int argc, const char* argv[]) {
 
   return 0;
 }
-
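
Aside from comment and whitespace reformatting, the one functional change in this patch is the `+ 2` bias added before each `>> 2` in the C edge-pixel paths of the 2x bilinear upscalers (with matching comment updates in the NEON files): the scalar code previously truncated, while the SIMD lanes already round to nearest (NEON `vqrshrn`/`uqrshrn` are rounding shifts and `vrhadd`/`urhadd` are rounding halving adds). Below is a minimal standalone sketch of that difference; `Blend31Trunc` and `Blend31Round` are illustrative names for this example only, not libyuv APIs.

// Standalone sketch (not part of the patch): the 3:1 edge blend used by the
// SU2BLANY/SBU2BLANY macros, before and after the rounding fix.
#include <cstdint>
#include <cstdio>

// Before the patch: floor((3*a + b) / 4) -- truncates.
static uint8_t Blend31Trunc(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((3 * a + b) >> 2);
}

// After the patch: floor((3*a + b + 2) / 4) -- rounds to nearest,
// matching a NEON rounding narrowing shift by 2 (vqrshrn/uqrshrn #2).
static uint8_t Blend31Round(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((3 * a + b + 2) >> 2);
}

int main() {
  // a=1, b=4: exact value 7/4 = 1.75 -> trunc gives 1, round gives 2.
  // a=0, b=255: exact value 63.75 -> trunc gives 63, round gives 64.
  const uint8_t pairs[][2] = {{1, 2}, {1, 4}, {0, 255}, {255, 0}};
  for (const auto& p : pairs) {
    std::printf("a=%3d b=%3d  trunc=%3d  round=%3d\n", p[0], p[1],
                Blend31Trunc(p[0], p[1]), Blend31Round(p[0], p[1]));
  }
  return 0;
}

Without the bias, the scalar first/last columns could differ by one from the SIMD-computed interior, which shows up as an off-by-one seam in unit tests comparing the Any-width wrappers against the pure C reference.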