mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Rounding added to scaling upsampler
Bug: libyuv:872, b/178521093
Change-Id: I86749f73f5e55d5fd8b87ea6938084cbacb1cda7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2686945
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
742791f13a
commit
12a4a2372c
@ -15,8 +15,8 @@
|
|||||||
#include "libyuv/planar_functions.h"
|
#include "libyuv/planar_functions.h"
|
||||||
#include "libyuv/rotate.h"
|
#include "libyuv/rotate.h"
|
||||||
#include "libyuv/row.h"
|
#include "libyuv/row.h"
|
||||||
#include "libyuv/scale.h" // For ScalePlane()
|
#include "libyuv/scale.h" // For ScalePlane()
|
||||||
#include "libyuv/scale_uv.h" // For UVScale()
|
#include "libyuv/scale_uv.h" // For UVScale()
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
|
|||||||
@ -4374,15 +4374,15 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
|
|||||||
"lea 8(%1),%1 \n"
|
"lea 8(%1),%1 \n"
|
||||||
"subl $0x8,%5 \n"
|
"subl $0x8,%5 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_r), // %1
|
"+r"(dst_r), // %1
|
||||||
"+r"(dst_g), // %2
|
"+r"(dst_g), // %2
|
||||||
"+r"(dst_b), // %3
|
"+r"(dst_b), // %3
|
||||||
"+r"(dst_a), // %4
|
"+r"(dst_a), // %4
|
||||||
#if defined(__i386__)
|
#if defined(__i386__)
|
||||||
"+m"(width) // %5
|
"+m"(width) // %5
|
||||||
#else
|
#else
|
||||||
"+rm"(width) // %5
|
"+rm"(width) // %5
|
||||||
#endif
|
#endif
|
||||||
: "m"(kShuffleMaskARGBSplit) // %6
|
: "m"(kShuffleMaskARGBSplit) // %6
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2");
|
||||||
@ -4465,15 +4465,15 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
|
|||||||
"lea 16(%1),%1 \n"
|
"lea 16(%1),%1 \n"
|
||||||
"subl $0x10,%5 \n"
|
"subl $0x10,%5 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_r), // %1
|
"+r"(dst_r), // %1
|
||||||
"+r"(dst_g), // %2
|
"+r"(dst_g), // %2
|
||||||
"+r"(dst_b), // %3
|
"+r"(dst_b), // %3
|
||||||
"+r"(dst_a), // %4
|
"+r"(dst_a), // %4
|
||||||
#if defined(__i386__)
|
#if defined(__i386__)
|
||||||
"+m"(width) // %5
|
"+m"(width) // %5
|
||||||
#else
|
#else
|
||||||
"+rm"(width) // %5
|
"+rm"(width) // %5
|
||||||
#endif
|
#endif
|
||||||
: "m"(kShuffleMaskARGBSplit_AVX2), // %6
|
: "m"(kShuffleMaskARGBSplit_AVX2), // %6
|
||||||
"m"(kShuffleMaskARGBPermute_AVX2) // %7
|
"m"(kShuffleMaskARGBPermute_AVX2) // %7
|
||||||
@ -7186,7 +7186,7 @@ void HalfFloatRow_AVX2(const uint16_t* src,
|
|||||||
#if defined(__x86_64__)
|
#if defined(__x86_64__)
|
||||||
: "x"(scale) // %3
|
: "x"(scale) // %3
|
||||||
#else
|
#else
|
||||||
: "m"(scale) // %3
|
: "m"(scale) // %3
|
||||||
#endif
|
#endif
|
||||||
: "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
|
: "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
|
||||||
}
|
}
|
||||||
@ -7224,7 +7224,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
|
|||||||
#if defined(__x86_64__)
|
#if defined(__x86_64__)
|
||||||
: "x"(scale) // %3
|
: "x"(scale) // %3
|
||||||
#else
|
#else
|
||||||
: "m"(scale) // %3
|
: "m"(scale) // %3
|
||||||
#endif
|
#endif
|
||||||
: "memory", "cc", "xmm2", "xmm3", "xmm4");
|
: "memory", "cc", "xmm2", "xmm3", "xmm4");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -700,28 +700,28 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
|
|||||||
|
|
||||||
// Scale up 2 times using bilinear filter.
|
// Scale up 2 times using bilinear filter.
|
||||||
// This function produces 2 rows at a time.
|
// This function produces 2 rows at a time.
|
||||||
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
|
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
|
||||||
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
|
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
|
||||||
ptrdiff_t dst_stride, int dst_width) { \
|
ptrdiff_t dst_stride, int dst_width) { \
|
||||||
int work_width = (dst_width - 1) & ~1; \
|
int work_width = (dst_width - 1) & ~1; \
|
||||||
int r = work_width & MASK; \
|
int r = work_width & MASK; \
|
||||||
int n = work_width & ~MASK; \
|
int n = work_width & ~MASK; \
|
||||||
const PTYPE* sa = src_ptr; \
|
const PTYPE* sa = src_ptr; \
|
||||||
const PTYPE* sb = src_ptr + src_stride; \
|
const PTYPE* sb = src_ptr + src_stride; \
|
||||||
PTYPE* da = dst_ptr; \
|
PTYPE* da = dst_ptr; \
|
||||||
PTYPE* db = dst_ptr + dst_stride; \
|
PTYPE* db = dst_ptr + dst_stride; \
|
||||||
da[0] = (3 * sa[0] + sb[0]) >> 2; \
|
da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
|
||||||
db[0] = (sa[0] + 3 * sb[0]) >> 2; \
|
db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
|
||||||
if (work_width > 0) { \
|
if (work_width > 0) { \
|
||||||
if (n != 0) { \
|
if (n != 0) { \
|
||||||
SIMD(sa, sb - sa, da + 1, db - da, n); \
|
SIMD(sa, sb - sa, da + 1, db - da, n); \
|
||||||
} \
|
} \
|
||||||
C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
|
C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
|
||||||
} \
|
} \
|
||||||
da[dst_width - 1] = \
|
da[dst_width - 1] = \
|
||||||
(3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2]) >> 2; \
|
(3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \
|
||||||
db[dst_width - 1] = \
|
db[dst_width - 1] = \
|
||||||
(sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2]) >> 2; \
|
(sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \
|
||||||
}
|
}
|
||||||
|
|
||||||
SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
|
SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
|
||||||
@ -856,10 +856,10 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
|
|||||||
const PTYPE* sb = src_ptr + src_stride; \
|
const PTYPE* sb = src_ptr + src_stride; \
|
||||||
PTYPE* da = dst_ptr; \
|
PTYPE* da = dst_ptr; \
|
||||||
PTYPE* db = dst_ptr + dst_stride; \
|
PTYPE* db = dst_ptr + dst_stride; \
|
||||||
da[0] = (3 * sa[0] + sb[0]) >> 2; \
|
da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
|
||||||
db[0] = (sa[0] + 3 * sb[0]) >> 2; \
|
db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
|
||||||
da[1] = (3 * sa[1] + sb[1]) >> 2; \
|
da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \
|
||||||
db[1] = (sa[1] + 3 * sb[1]) >> 2; \
|
db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \
|
||||||
if (work_width > 0) { \
|
if (work_width > 0) { \
|
||||||
if (n != 0) { \
|
if (n != 0) { \
|
||||||
SIMD(sa, sb - sa, da + 2, db - da, n); \
|
SIMD(sa, sb - sa, da + 2, db - da, n); \
|
||||||
@ -867,13 +867,17 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
|
|||||||
C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \
|
C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \
|
||||||
} \
|
} \
|
||||||
da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \
|
da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \
|
||||||
sb[((dst_width + 1) & ~1) - 2]) >> 2; \
|
sb[((dst_width + 1) & ~1) - 2] + 2) >> \
|
||||||
|
2; \
|
||||||
db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \
|
db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \
|
||||||
3 * sb[((dst_width + 1) & ~1) - 2]) >> 2; \
|
3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \
|
||||||
|
2; \
|
||||||
da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \
|
da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \
|
||||||
sb[((dst_width + 1) & ~1) - 1]) >> 2; \
|
sb[((dst_width + 1) & ~1) - 1] + 2) >> \
|
||||||
|
2; \
|
||||||
db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \
|
db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \
|
||||||
3 * sb[((dst_width + 1) & ~1) - 1]) >> 2; \
|
3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \
|
||||||
|
2; \
|
||||||
}
|
}
|
||||||
|
|
||||||
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
|
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
|
||||||
|
|||||||
@ -1232,21 +1232,29 @@ void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
|
|||||||
assert((dst_width % 2 == 0) && (dst_width >= 0));
|
assert((dst_width % 2 == 0) && (dst_width >= 0));
|
||||||
for (x = 0; x < src_width; ++x) {
|
for (x = 0; x < src_width; ++x) {
|
||||||
d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
|
d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
|
||||||
t[2 * x + 2] * 1 + 8) >> 4;
|
t[2 * x + 2] * 1 + 8) >>
|
||||||
|
4;
|
||||||
d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
|
d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
|
||||||
t[2 * x + 3] * 1 + 8) >> 4;
|
t[2 * x + 3] * 1 + 8) >>
|
||||||
|
4;
|
||||||
d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
|
d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
|
||||||
t[2 * x + 2] * 3 + 8) >> 4;
|
t[2 * x + 2] * 3 + 8) >>
|
||||||
|
4;
|
||||||
d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
|
d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
|
||||||
t[2 * x + 3] * 3 + 8) >> 4;
|
t[2 * x + 3] * 3 + 8) >>
|
||||||
|
4;
|
||||||
e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
|
e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
|
||||||
t[2 * x + 2] * 3 + 8) >> 4;
|
t[2 * x + 2] * 3 + 8) >>
|
||||||
|
4;
|
||||||
e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
|
e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
|
||||||
t[2 * x + 3] * 3 + 8) >> 4;
|
t[2 * x + 3] * 3 + 8) >>
|
||||||
|
4;
|
||||||
e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
|
e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
|
||||||
t[2 * x + 2] * 9 + 8) >> 4;
|
t[2 * x + 2] * 9 + 8) >>
|
||||||
|
4;
|
||||||
e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
|
e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
|
||||||
t[2 * x + 3] * 9 + 8) >> 4;
|
t[2 * x + 3] * 9 + 8) >>
|
||||||
|
4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -196,8 +196,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vmovdqu (%0),%%ymm0 \n"
|
"vmovdqu (%0),%%ymm0 \n"
|
||||||
"vmovdqu 0x20(%0),%%ymm1 \n"
|
"vmovdqu 0x20(%0),%%ymm1 \n"
|
||||||
@ -211,11 +210,11 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
|
|||||||
"sub $0x20,%2 \n"
|
"sub $0x20,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
"vzeroupper \n"
|
"vzeroupper \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
::"memory",
|
::"memory",
|
||||||
"cc", "xmm0", "xmm1");
|
"cc", "xmm0", "xmm1");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
|
void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
|
||||||
@ -483,8 +482,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
|
|||||||
"m"(kShuf1), // %1
|
"m"(kShuf1), // %1
|
||||||
"m"(kShuf2) // %2
|
"m"(kShuf2) // %2
|
||||||
);
|
);
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm0 \n"
|
"movdqu (%0),%%xmm0 \n"
|
||||||
"movdqu 0x10(%0),%%xmm2 \n"
|
"movdqu 0x10(%0),%%xmm2 \n"
|
||||||
@ -500,11 +498,11 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
|
|||||||
"lea 0x18(%1),%1 \n"
|
"lea 0x18(%1),%1 \n"
|
||||||
"sub $0x18,%2 \n"
|
"sub $0x18,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
::"memory",
|
::"memory",
|
||||||
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
|
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
|
||||||
@ -529,8 +527,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
|
|||||||
"m"(kMadd11), // %1
|
"m"(kMadd11), // %1
|
||||||
"m"(kRound34) // %2
|
"m"(kRound34) // %2
|
||||||
);
|
);
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm6 \n"
|
"movdqu (%0),%%xmm6 \n"
|
||||||
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
|
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
|
||||||
@ -563,13 +560,13 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
|
|||||||
"lea 0x18(%1),%1 \n"
|
"lea 0x18(%1),%1 \n"
|
||||||
"sub $0x18,%2 \n"
|
"sub $0x18,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
: "r"((intptr_t)(src_stride)), // %3
|
: "r"((intptr_t)(src_stride)), // %3
|
||||||
"m"(kMadd21) // %4
|
"m"(kMadd21) // %4
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
|
||||||
"xmm7");
|
"xmm6", "xmm7");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
|
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
|
||||||
@ -595,8 +592,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
|
|||||||
"m"(kRound34) // %2
|
"m"(kRound34) // %2
|
||||||
);
|
);
|
||||||
|
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm6 \n"
|
"movdqu (%0),%%xmm6 \n"
|
||||||
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
|
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
|
||||||
@ -632,13 +628,13 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
|
|||||||
"lea 0x18(%1),%1 \n"
|
"lea 0x18(%1),%1 \n"
|
||||||
"sub $0x18,%2 \n"
|
"sub $0x18,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
: "r"((intptr_t)(src_stride)), // %3
|
: "r"((intptr_t)(src_stride)), // %3
|
||||||
"m"(kMadd21) // %4
|
"m"(kMadd21) // %4
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
|
||||||
"xmm7");
|
"xmm6", "xmm7");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
|
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
|
||||||
@ -687,8 +683,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
|
|||||||
"m"(kShufAb2), // %2
|
"m"(kShufAb2), // %2
|
||||||
"m"(kScaleAb2) // %3
|
"m"(kScaleAb2) // %3
|
||||||
);
|
);
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm0 \n"
|
"movdqu (%0),%%xmm0 \n"
|
||||||
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
|
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
|
||||||
@ -709,11 +704,12 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
|
|||||||
"lea 0x6(%1),%1 \n"
|
"lea 0x6(%1),%1 \n"
|
||||||
"sub $0x6,%2 \n"
|
"sub $0x6,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
: "r"((intptr_t)(src_stride)) // %3
|
: "r"((intptr_t)(src_stride)) // %3
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
|
||||||
|
"xmm6");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
|
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
|
||||||
@ -730,8 +726,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
|
|||||||
"m"(kShufAc3), // %1
|
"m"(kShufAc3), // %1
|
||||||
"m"(kScaleAc33) // %2
|
"m"(kScaleAc33) // %2
|
||||||
);
|
);
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm0 \n"
|
"movdqu (%0),%%xmm0 \n"
|
||||||
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
|
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
|
||||||
@ -771,12 +766,12 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
|
|||||||
"lea 0x6(%1),%1 \n"
|
"lea 0x6(%1),%1 \n"
|
||||||
"sub $0x6,%2 \n"
|
"sub $0x6,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
: "r"((intptr_t)(src_stride)) // %3
|
: "r"((intptr_t)(src_stride)) // %3
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
|
||||||
"xmm7");
|
"xmm6", "xmm7");
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
|
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
|
||||||
@ -1601,11 +1596,10 @@ void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
|
|||||||
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
||||||
uint16_t* dst_ptr,
|
uint16_t* dst_ptr,
|
||||||
int src_width) {
|
int src_width) {
|
||||||
asm volatile(
|
asm volatile("pxor %%xmm5,%%xmm5 \n"
|
||||||
"pxor %%xmm5,%%xmm5 \n"
|
|
||||||
|
|
||||||
// 16 pixel loop.
|
// 16 pixel loop.
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm3 \n"
|
"movdqu (%0),%%xmm3 \n"
|
||||||
"lea 0x10(%0),%0 \n" // src_ptr += 16
|
"lea 0x10(%0),%0 \n" // src_ptr += 16
|
||||||
@ -1621,11 +1615,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
|||||||
"lea 0x20(%1),%1 \n"
|
"lea 0x20(%1),%1 \n"
|
||||||
"sub $0x10,%2 \n"
|
"sub $0x10,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(src_width) // %2
|
"+r"(src_width) // %2
|
||||||
:
|
:
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAS_SCALEADDROW_AVX2
|
#ifdef HAS_SCALEADDROW_AVX2
|
||||||
@ -1633,10 +1627,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
|||||||
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
|
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
|
||||||
uint16_t* dst_ptr,
|
uint16_t* dst_ptr,
|
||||||
int src_width) {
|
int src_width) {
|
||||||
asm volatile(
|
asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
||||||
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
|
||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vmovdqu (%0),%%ymm3 \n"
|
"vmovdqu (%0),%%ymm3 \n"
|
||||||
"lea 0x20(%0),%0 \n" // src_ptr += 32
|
"lea 0x20(%0),%0 \n" // src_ptr += 32
|
||||||
@ -1651,11 +1644,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
|
|||||||
"sub $0x20,%2 \n"
|
"sub $0x20,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
"vzeroupper \n"
|
"vzeroupper \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(src_width) // %2
|
"+r"(src_width) // %2
|
||||||
:
|
:
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
||||||
}
|
}
|
||||||
#endif // HAS_SCALEADDROW_AVX2
|
#endif // HAS_SCALEADDROW_AVX2
|
||||||
|
|
||||||
@ -1772,8 +1765,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
|
|||||||
int dx) {
|
int dx) {
|
||||||
(void)x;
|
(void)x;
|
||||||
(void)dx;
|
(void)dx;
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%1),%%xmm0 \n"
|
"movdqu (%1),%%xmm0 \n"
|
||||||
"lea 0x10(%1),%1 \n"
|
"lea 0x10(%1),%1 \n"
|
||||||
@ -1786,11 +1778,11 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
|
|||||||
"sub $0x20,%2 \n"
|
"sub $0x20,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
|
|
||||||
: "+r"(dst_ptr), // %0
|
: "+r"(dst_ptr), // %0
|
||||||
"+r"(src_ptr), // %1
|
"+r"(src_ptr), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
::"memory",
|
::"memory",
|
||||||
"cc", "xmm0", "xmm1");
|
"cc", "xmm0", "xmm1");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
|
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
|
||||||
@ -1798,8 +1790,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm0 \n"
|
"movdqu (%0),%%xmm0 \n"
|
||||||
"movdqu 0x10(%0),%%xmm1 \n"
|
"movdqu 0x10(%0),%%xmm1 \n"
|
||||||
@ -1809,11 +1800,11 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
|
|||||||
"lea 0x10(%1),%1 \n"
|
"lea 0x10(%1),%1 \n"
|
||||||
"sub $0x4,%2 \n"
|
"sub $0x4,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
::"memory",
|
::"memory",
|
||||||
"cc", "xmm0", "xmm1");
|
"cc", "xmm0", "xmm1");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
|
void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
|
||||||
@ -1821,8 +1812,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm0 \n"
|
"movdqu (%0),%%xmm0 \n"
|
||||||
"movdqu 0x10(%0),%%xmm1 \n"
|
"movdqu 0x10(%0),%%xmm1 \n"
|
||||||
@ -1835,19 +1825,18 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
|
|||||||
"lea 0x10(%1),%1 \n"
|
"lea 0x10(%1),%1 \n"
|
||||||
"sub $0x4,%2 \n"
|
"sub $0x4,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
::"memory",
|
::"memory",
|
||||||
"cc", "xmm0", "xmm1");
|
"cc", "xmm0", "xmm1");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
|
void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
|
||||||
ptrdiff_t src_stride,
|
ptrdiff_t src_stride,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm0 \n"
|
"movdqu (%0),%%xmm0 \n"
|
||||||
"movdqu 0x10(%0),%%xmm1 \n"
|
"movdqu 0x10(%0),%%xmm1 \n"
|
||||||
@ -1864,11 +1853,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
|
|||||||
"lea 0x10(%1),%1 \n"
|
"lea 0x10(%1),%1 \n"
|
||||||
"sub $0x4,%2 \n"
|
"sub $0x4,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
: "r"((intptr_t)(src_stride)) // %3
|
: "r"((intptr_t)(src_stride)) // %3
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reads 4 pixels at a time.
|
// Reads 4 pixels at a time.
|
||||||
@ -2032,8 +2021,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
|
|||||||
int dx) {
|
int dx) {
|
||||||
(void)x;
|
(void)x;
|
||||||
(void)dx;
|
(void)dx;
|
||||||
asm volatile(
|
asm volatile(LABELALIGN
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%1),%%xmm0 \n"
|
"movdqu (%1),%%xmm0 \n"
|
||||||
"lea 0x10(%1),%1 \n"
|
"lea 0x10(%1),%1 \n"
|
||||||
@ -2046,11 +2034,11 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
|
|||||||
"sub $0x8,%2 \n"
|
"sub $0x8,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
|
|
||||||
: "+r"(dst_argb), // %0
|
: "+r"(dst_argb), // %0
|
||||||
"+r"(src_argb), // %1
|
"+r"(src_argb), // %1
|
||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
::"memory",
|
::"memory",
|
||||||
"cc", "xmm0", "xmm1");
|
"cc", "xmm0", "xmm1");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
|
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
|
||||||
@ -2381,7 +2369,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
|
|||||||
"+r"(dst_width) // %2
|
"+r"(dst_width) // %2
|
||||||
: "r"((intptr_t)(src_stride)), // %3
|
: "r"((intptr_t)(src_stride)), // %3
|
||||||
"r"((intptr_t)(dst_stride)), // %4
|
"r"((intptr_t)(dst_stride)), // %4
|
||||||
"m"(kUVLinearMadd31_SSSE3) // %5
|
"m"(kUVLinearMadd31_SSSE3) // %5
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||||
"xmm7");
|
"xmm7");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -194,21 +194,21 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
|
|||||||
"vmlal.u8 q10, d2, d24 \n"
|
"vmlal.u8 q10, d2, d24 \n"
|
||||||
"vmlal.u8 q11, d3, d24 \n"
|
"vmlal.u8 q11, d3, d24 \n"
|
||||||
|
|
||||||
// (3 * line_0 + line_1) >> 2
|
// (3 * line_0 + line_1 + 2) >> 2
|
||||||
"vqrshrn.u16 d0, q8, #2 \n"
|
"vqrshrn.u16 d0, q8, #2 \n"
|
||||||
"vqrshrn.u16 d1, q9, #2 \n"
|
"vqrshrn.u16 d1, q9, #2 \n"
|
||||||
"vqrshrn.u16 d2, q10, #2 \n"
|
"vqrshrn.u16 d2, q10, #2 \n"
|
||||||
"vqrshrn.u16 d3, q11, #2 \n"
|
"vqrshrn.u16 d3, q11, #2 \n"
|
||||||
|
|
||||||
// a0 = (src[0] * 3 + s[1] * 1) >> 2
|
// a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
|
||||||
"vmovl.u8 q8, d1 \n"
|
"vmovl.u8 q8, d1 \n"
|
||||||
"vmlal.u8 q8, d0, d24 \n"
|
"vmlal.u8 q8, d0, d24 \n"
|
||||||
"vqrshrn.u16 d0, q8, #2 \n"
|
"vqrshrn.u16 d0, q8, #2 \n"
|
||||||
|
|
||||||
// a1 = (src[1] * 1 + s[2] * 1) >> 1
|
// a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
|
||||||
"vrhadd.u8 d1, d1, d2 \n"
|
"vrhadd.u8 d1, d1, d2 \n"
|
||||||
|
|
||||||
// a2 = (src[2] * 1 + s[3] * 3) >> 2
|
// a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
|
||||||
"vmovl.u8 q8, d2 \n"
|
"vmovl.u8 q8, d2 \n"
|
||||||
"vmlal.u8 q8, d3, d24 \n"
|
"vmlal.u8 q8, d3, d24 \n"
|
||||||
"vqrshrn.u16 d2, q8, #2 \n"
|
"vqrshrn.u16 d2, q8, #2 \n"
|
||||||
@ -240,15 +240,15 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
|
|||||||
"vrhadd.u8 q0, q0, q2 \n"
|
"vrhadd.u8 q0, q0, q2 \n"
|
||||||
"vrhadd.u8 q1, q1, q3 \n"
|
"vrhadd.u8 q1, q1, q3 \n"
|
||||||
|
|
||||||
// a0 = (src[0] * 3 + s[1] * 1) >> 2
|
// a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
|
||||||
"vmovl.u8 q3, d1 \n"
|
"vmovl.u8 q3, d1 \n"
|
||||||
"vmlal.u8 q3, d0, d24 \n"
|
"vmlal.u8 q3, d0, d24 \n"
|
||||||
"vqrshrn.u16 d0, q3, #2 \n"
|
"vqrshrn.u16 d0, q3, #2 \n"
|
||||||
|
|
||||||
// a1 = (src[1] * 1 + s[2] * 1) >> 1
|
// a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
|
||||||
"vrhadd.u8 d1, d1, d2 \n"
|
"vrhadd.u8 d1, d1, d2 \n"
|
||||||
|
|
||||||
// a2 = (src[2] * 1 + s[3] * 3) >> 2
|
// a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
|
||||||
"vmovl.u8 q3, d2 \n"
|
"vmovl.u8 q3, d2 \n"
|
||||||
"vmlal.u8 q3, d3, d24 \n"
|
"vmlal.u8 q3, d3, d24 \n"
|
||||||
"vqrshrn.u16 d2, q3, #2 \n"
|
"vqrshrn.u16 d2, q3, #2 \n"
|
||||||
|
|||||||
@ -201,22 +201,22 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
|
|||||||
"umlal v19.8h, v3.8b, v20.8b \n"
|
"umlal v19.8h, v3.8b, v20.8b \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||||
|
|
||||||
// (3 * line_0 + line_1) >> 2
|
// (3 * line_0 + line_1 + 2) >> 2
|
||||||
"uqrshrn v0.8b, v16.8h, #2 \n"
|
"uqrshrn v0.8b, v16.8h, #2 \n"
|
||||||
"uqrshrn v1.8b, v17.8h, #2 \n"
|
"uqrshrn v1.8b, v17.8h, #2 \n"
|
||||||
"uqrshrn v2.8b, v18.8h, #2 \n"
|
"uqrshrn v2.8b, v18.8h, #2 \n"
|
||||||
"uqrshrn v3.8b, v19.8h, #2 \n"
|
"uqrshrn v3.8b, v19.8h, #2 \n"
|
||||||
"prfm pldl1keep, [%3, 448] \n"
|
"prfm pldl1keep, [%3, 448] \n"
|
||||||
|
|
||||||
// a0 = (src[0] * 3 + s[1] * 1) >> 2
|
// a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
|
||||||
"ushll v16.8h, v1.8b, #0 \n"
|
"ushll v16.8h, v1.8b, #0 \n"
|
||||||
"umlal v16.8h, v0.8b, v20.8b \n"
|
"umlal v16.8h, v0.8b, v20.8b \n"
|
||||||
"uqrshrn v0.8b, v16.8h, #2 \n"
|
"uqrshrn v0.8b, v16.8h, #2 \n"
|
||||||
|
|
||||||
// a1 = (src[1] * 1 + s[2] * 1) >> 1
|
// a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
|
||||||
"urhadd v1.8b, v1.8b, v2.8b \n"
|
"urhadd v1.8b, v1.8b, v2.8b \n"
|
||||||
|
|
||||||
// a2 = (src[2] * 1 + s[3] * 3) >> 2
|
// a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
|
||||||
"ushll v16.8h, v2.8b, #0 \n"
|
"ushll v16.8h, v2.8b, #0 \n"
|
||||||
"umlal v16.8h, v3.8b, v20.8b \n"
|
"umlal v16.8h, v3.8b, v20.8b \n"
|
||||||
"uqrshrn v2.8b, v16.8h, #2 \n"
|
"uqrshrn v2.8b, v16.8h, #2 \n"
|
||||||
@ -251,16 +251,16 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
|
|||||||
"urhadd v3.8b, v3.8b, v7.8b \n"
|
"urhadd v3.8b, v3.8b, v7.8b \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||||
|
|
||||||
// a0 = (src[0] * 3 + s[1] * 1) >> 2
|
// a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
|
||||||
"ushll v4.8h, v1.8b, #0 \n"
|
"ushll v4.8h, v1.8b, #0 \n"
|
||||||
"umlal v4.8h, v0.8b, v20.8b \n"
|
"umlal v4.8h, v0.8b, v20.8b \n"
|
||||||
"uqrshrn v0.8b, v4.8h, #2 \n"
|
"uqrshrn v0.8b, v4.8h, #2 \n"
|
||||||
"prfm pldl1keep, [%3, 448] \n"
|
"prfm pldl1keep, [%3, 448] \n"
|
||||||
|
|
||||||
// a1 = (src[1] * 1 + s[2] * 1) >> 1
|
// a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
|
||||||
"urhadd v1.8b, v1.8b, v2.8b \n"
|
"urhadd v1.8b, v1.8b, v2.8b \n"
|
||||||
|
|
||||||
// a2 = (src[2] * 1 + s[3] * 3) >> 2
|
// a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
|
||||||
"ushll v4.8h, v2.8b, #0 \n"
|
"ushll v4.8h, v2.8b, #0 \n"
|
||||||
"umlal v4.8h, v3.8b, v20.8b \n"
|
"umlal v4.8h, v3.8b, v20.8b \n"
|
||||||
"uqrshrn v2.8b, v4.8h, #2 \n"
|
"uqrshrn v2.8b, v4.8h, #2 \n"
|
||||||
|
|||||||
@ -690,8 +690,7 @@ void ScaleUVLinearUp2(int src_width,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (dst_height == 1) {
|
if (dst_height == 1) {
|
||||||
ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv,
|
ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width);
|
||||||
dst_width);
|
|
||||||
} else {
|
} else {
|
||||||
dy = FixedDiv(src_height - 1, dst_height - 1);
|
dy = FixedDiv(src_height - 1, dst_height - 1);
|
||||||
y = (1 << 15) - 1;
|
y = (1 << 15) - 1;
|
||||||
|
|||||||
@ -470,7 +470,7 @@ static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
|
|||||||
|
|
||||||
// BT.2020 full range YUV to RGB reference
|
// BT.2020 full range YUV to RGB reference
|
||||||
static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
|
static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
|
||||||
*r = RoundToByte(y + (v - 128) * 1.474600);
|
*r = RoundToByte(y + (v - 128) * 1.474600);
|
||||||
*g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
|
*g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
|
||||||
*b = RoundToByte(y + (u - 128) * 1.881400);
|
*b = RoundToByte(y + (u - 128) * 1.881400);
|
||||||
}
|
}
|
||||||
@ -609,9 +609,15 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
|
|||||||
|
|
||||||
// BT.601 limited range.
|
// BT.601 limited range.
|
||||||
TEST_F(LibYUVColorTest, TestFullYUV) {
|
TEST_F(LibYUVColorTest, TestFullYUV) {
|
||||||
int rh[256] = { 0, };
|
int rh[256] = {
|
||||||
int gh[256] = { 0, };
|
0,
|
||||||
int bh[256] = { 0, };
|
};
|
||||||
|
int gh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
|
int bh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
for (int u = 0; u < 256; ++u) {
|
for (int u = 0; u < 256; ++u) {
|
||||||
for (int v = 0; v < 256; ++v) {
|
for (int v = 0; v < 256; ++v) {
|
||||||
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
||||||
@ -633,9 +639,15 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
|
|||||||
|
|
||||||
// BT.601 full range.
|
// BT.601 full range.
|
||||||
TEST_F(LibYUVColorTest, TestFullYUVJ) {
|
TEST_F(LibYUVColorTest, TestFullYUVJ) {
|
||||||
int rh[256] = { 0, };
|
int rh[256] = {
|
||||||
int gh[256] = { 0, };
|
0,
|
||||||
int bh[256] = { 0, };
|
};
|
||||||
|
int gh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
|
int bh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
for (int u = 0; u < 256; ++u) {
|
for (int u = 0; u < 256; ++u) {
|
||||||
for (int v = 0; v < 256; ++v) {
|
for (int v = 0; v < 256; ++v) {
|
||||||
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
||||||
@ -657,9 +669,15 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
|
|||||||
|
|
||||||
// BT.709 limited range.
|
// BT.709 limited range.
|
||||||
TEST_F(LibYUVColorTest, TestFullYUVH) {
|
TEST_F(LibYUVColorTest, TestFullYUVH) {
|
||||||
int rh[256] = { 0, };
|
int rh[256] = {
|
||||||
int gh[256] = { 0, };
|
0,
|
||||||
int bh[256] = { 0, };
|
};
|
||||||
|
int gh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
|
int bh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
for (int u = 0; u < 256; ++u) {
|
for (int u = 0; u < 256; ++u) {
|
||||||
for (int v = 0; v < 256; ++v) {
|
for (int v = 0; v < 256; ++v) {
|
||||||
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
||||||
@ -682,9 +700,15 @@ TEST_F(LibYUVColorTest, TestFullYUVH) {
|
|||||||
|
|
||||||
// BT.709 full range.
|
// BT.709 full range.
|
||||||
TEST_F(LibYUVColorTest, TestFullYUVF) {
|
TEST_F(LibYUVColorTest, TestFullYUVF) {
|
||||||
int rh[256] = { 0, };
|
int rh[256] = {
|
||||||
int gh[256] = { 0, };
|
0,
|
||||||
int bh[256] = { 0, };
|
};
|
||||||
|
int gh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
|
int bh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
for (int u = 0; u < 256; ++u) {
|
for (int u = 0; u < 256; ++u) {
|
||||||
for (int v = 0; v < 256; ++v) {
|
for (int v = 0; v < 256; ++v) {
|
||||||
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
||||||
@ -706,9 +730,15 @@ TEST_F(LibYUVColorTest, TestFullYUVF) {
|
|||||||
|
|
||||||
// BT.2020 limited range.
|
// BT.2020 limited range.
|
||||||
TEST_F(LibYUVColorTest, TestFullYUVU) {
|
TEST_F(LibYUVColorTest, TestFullYUVU) {
|
||||||
int rh[256] = { 0, };
|
int rh[256] = {
|
||||||
int gh[256] = { 0, };
|
0,
|
||||||
int bh[256] = { 0, };
|
};
|
||||||
|
int gh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
|
int bh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
for (int u = 0; u < 256; ++u) {
|
for (int u = 0; u < 256; ++u) {
|
||||||
for (int v = 0; v < 256; ++v) {
|
for (int v = 0; v < 256; ++v) {
|
||||||
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
||||||
@ -731,9 +761,15 @@ TEST_F(LibYUVColorTest, TestFullYUVU) {
|
|||||||
|
|
||||||
// BT.2020 full range.
|
// BT.2020 full range.
|
||||||
TEST_F(LibYUVColorTest, TestFullYUVV) {
|
TEST_F(LibYUVColorTest, TestFullYUVV) {
|
||||||
int rh[256] = { 0, };
|
int rh[256] = {
|
||||||
int gh[256] = { 0, };
|
0,
|
||||||
int bh[256] = { 0, };
|
};
|
||||||
|
int gh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
|
int bh[256] = {
|
||||||
|
0,
|
||||||
|
};
|
||||||
for (int u = 0; u < 256; ++u) {
|
for (int u = 0; u < 256; ++u) {
|
||||||
for (int v = 0; v < 256; ++v) {
|
for (int v = 0; v < 256; ++v) {
|
||||||
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
|
||||||
|
|||||||
@ -794,10 +794,10 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
|
|||||||
#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
|
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
|
||||||
l, m)
|
l, m)
|
||||||
#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
||||||
l, m)
|
l, m)
|
||||||
#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
||||||
l, m)
|
l, m)
|
||||||
#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
@ -824,10 +824,10 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
|
|||||||
#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
|
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
|
||||||
l, m)
|
l, m)
|
||||||
#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
||||||
l, m)
|
l, m)
|
||||||
#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
||||||
l, m)
|
l, m)
|
||||||
#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
@ -854,10 +854,10 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
|
|||||||
#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
|
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
|
||||||
l, m)
|
l, m)
|
||||||
#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
||||||
l, m)
|
l, m)
|
||||||
#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
|
||||||
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
|
||||||
l, m)
|
l, m)
|
||||||
|
|
||||||
|
|||||||
@ -18,11 +18,15 @@
|
|||||||
|
|
||||||
// For those MCs that can be represented as kr and kb:
|
// For those MCs that can be represented as kr and kb:
|
||||||
// Full range
|
// Full range
|
||||||
// float M[3][3] {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
|
// float M[3][3]
|
||||||
// float B[3] {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
|
// {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
|
||||||
|
// float B[3]
|
||||||
|
// {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
|
||||||
// Limited range
|
// Limited range
|
||||||
// float M[3][3] {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
|
// float M[3][3]
|
||||||
// float B[3] {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
|
// {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
|
||||||
|
// float B[3]
|
||||||
|
// {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
|
||||||
|
|
||||||
// mc bt
|
// mc bt
|
||||||
// 1 bt.709 KR = 0.2126; KB = 0.0722
|
// 1 bt.709 KR = 0.2126; KB = 0.0722
|
||||||
@ -56,11 +60,10 @@
|
|||||||
// #define BR (-VR * 128 + YB)
|
// #define BR (-VR * 128 + YB)
|
||||||
|
|
||||||
int round(float v) {
|
int round(float v) {
|
||||||
return (int) (v + 0.5);
|
return (int)(v + 0.5);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, const char* argv[]) {
|
int main(int argc, const char* argv[]) {
|
||||||
|
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
printf("color kr kb\n");
|
printf("color kr kb\n");
|
||||||
return -1;
|
return -1;
|
||||||
@ -81,11 +84,11 @@ int main(int argc, const char* argv[]) {
|
|||||||
|
|
||||||
printf("KR = %4f; ", kr);
|
printf("KR = %4f; ", kr);
|
||||||
printf("KB = %4f\n", kb);
|
printf("KB = %4f\n", kb);
|
||||||
// printf("KG = %4f\n", kg);
|
// printf("KG = %4f\n", kg);
|
||||||
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
|
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
|
||||||
// #define YB 32 /* 64 / 2 */
|
// #define YB 32 /* 64 / 2 */
|
||||||
//
|
//
|
||||||
// // U and V contributions to R,G,B.
|
// // U and V contributions to R,G,B.
|
||||||
|
|
||||||
printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
|
printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
|
||||||
printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
|
printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
|
||||||
@ -102,11 +105,11 @@ int main(int argc, const char* argv[]) {
|
|||||||
printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
|
printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
|
||||||
printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
|
printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
|
||||||
|
|
||||||
// printf("KG = %4f\n", kg);
|
// printf("KG = %4f\n", kg);
|
||||||
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
|
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
|
||||||
// #define YB 32 /* 64 / 2 */
|
// #define YB 32 /* 64 / 2 */
|
||||||
//
|
//
|
||||||
// // U and V contributions to R,G,B.
|
// // U and V contributions to R,G,B.
|
||||||
|
|
||||||
printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
|
printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
|
||||||
printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
|
printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
|
||||||
@ -115,4 +118,3 @@ int main(int argc, const char* argv[]) {
|
|||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user