Mirror of https://chromium.googlesource.com/libyuv/libyuv
Synced 2026-01-01 03:12:16 +08:00
Merge/SplitRGB fix -mcmodel=large x86 and InterpolateRow_16To8_NEON

MergeRGB and SplitRGB use a register to point to 9 shuffle tables,
which fixes an out-of-registers error with -mcmodel=large.

InterpolateRow_16To8_NEON improves performance for I210ToI420:
On Pixel 4 for 720p x1000 images
Was I210ToI420_Opt (608 ms)
Now I210ToI420_Opt (336 ms)
On Skylake Xeon
Was I210ToI420_Opt (259 ms)
Now I210ToI420_Opt (209 ms)

Bug: libyuv:931, libyuv:930
Change-Id: I20f8244803f06da511299bf1a2ffc7945eb35221
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3717054
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent fe4a50df8e
commit 6900494d90
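The -mcmodel=large fix is the pattern change visible in the SplitRGBRow_SSSE3 and MergeRGBRow_SSSE3 hunks below: nine separate "m" shuffle-table operands become one contiguous 9-entry table array passed as a single "r" base pointer, and each pshufb reaches table i at immediate offset 16 * i. A minimal standalone sketch of the idea (x86-64 GCC/Clang inline asm only; kTables and ShuffleCombine are illustrative names, not libyuv API):

#include <stdint.h>

typedef uint8_t uvec8 __attribute__((vector_size(16)));  // one 16-byte table

static const uvec8 kTables[9] = {{0}};  // nine tables, 16 bytes apart

void ShuffleCombine(const uint8_t* src, uint8_t* dst) {
  // Before: nine "m"(kTable0..8) operands.  Under -mcmodel=large the
  // compiler may materialize a separate 64-bit address register for each
  // "m" operand, and nine of them plus the data pointers exhaust the
  // x86-64 register file.  After: one "r" base register; table i sits at
  // fixed displacement 16*i from it, so no extra registers are needed.
  asm volatile(
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%0),%%xmm1 \n"
      "movdqu (%0),%%xmm2 \n"
      "pshufb 0(%2), %%xmm0 \n"   // kTables[0]
      "pshufb 16(%2), %%xmm1 \n"  // kTables[1]
      "pshufb 32(%2), %%xmm2 \n"  // kTables[2]
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      :
      : "r"(src), "r"(dst), "r"(&kTables[0])
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}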
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1831
+Version: 1832
 License: BSD
 License File: LICENSE
@@ -348,6 +348,7 @@ extern "C" {
 #define HAS_AR64TOARGBROW_AVX2
 #define HAS_AB64TOARGBROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
+#define HAS_INTERPOLATEROW_16TO8_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
 #define HAS_DIVIDEROW_16_AVX2
 #define HAS_HALFMERGEUVROW_AVX2
@@ -539,6 +540,7 @@ extern "C" {

 // The following are available on AArch64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_INTERPOLATEROW_16TO8_NEON
 #define HAS_SCALESUMSAMPLES_NEON
 #define HAS_GAUSSROW_F32_NEON
 #define HAS_GAUSSCOL_F32_NEON
@@ -5221,6 +5223,30 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
                             int scale,
                             int width,
                             int source_y_fraction);
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+                               const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int scale,
+                               int width,
+                               int source_y_fraction);
+void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr,
+                                   const uint16_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   int scale,
+                                   int width,
+                                   int source_y_fraction);
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+                               const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int scale,
+                               int width,
+                               int source_y_fraction);
+void InterpolateRow_16To8_Any_AVX2(uint8_t* dst_ptr,
+                                   const uint16_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   int scale,
+                                   int width,
+                                   int source_y_fraction);

 // Sobel images.
 void SobelXRow_C(const uint8_t* src_y0,
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1831
+#define LIBYUV_VERSION 1832

 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -1625,46 +1625,100 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7)
 #undef ANY11C

 // Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
-#define ANY11I(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
-  void NAMEANY(T* dst_ptr, const T* src_ptr, ptrdiff_t src_stride, int width, \
-               int source_y_fraction) { \
-    SIMD_ALIGNED(T temp[64 * 3]); \
-    memset(temp, 0, 64 * 2 * sizeof(T)); /* for msan */ \
-    int r = width & MASK; \
-    int n = width & ~MASK; \
-    if (n > 0) { \
-      ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
-    } \
-    memcpy(temp, src_ptr + n * SBPP, r * SBPP * sizeof(T)); \
-    if (source_y_fraction) { \
-      memcpy(temp + 64, src_ptr + src_stride + n * SBPP, \
-             r * SBPP * sizeof(T)); \
-    } \
-    ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
-    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP * sizeof(T)); \
+#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
+  void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
+               int width, int source_y_fraction) { \
+    SIMD_ALIGNED(TS temps[64 * 2]); \
+    SIMD_ALIGNED(TD tempd[64]); \
+    memset(temps, 0, sizeof(temps)); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
+    } \
+    memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
+    if (source_y_fraction) { \
+      memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \
+             r * SBPP * sizeof(TS)); \
+    } \
+    ANY_SIMD(tempd, temps, 64, MASK + 1, source_y_fraction); \
+    memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \
  }

 #ifdef HAS_INTERPOLATEROW_AVX2
-ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, 1, 1, 31)
+ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSSE3
-ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, uint8_t, 1, 1, 15)
+ANY11I(InterpolateRow_Any_SSSE3,
+       InterpolateRow_SSSE3,
+       uint8_t,
+       uint8_t,
+       1,
+       1,
+       15)
 #endif
 #ifdef HAS_INTERPOLATEROW_NEON
-ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, 1, 1, 15)
+ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_MSA
-ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, 1, 1, 31)
+ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, uint8_t, 1, 1, 31)
 #endif
 #ifdef HAS_INTERPOLATEROW_LSX
-ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, 1, 1, 31)
+ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, uint8_t, 1, 1, 31)
 #endif

 #ifdef HAS_INTERPOLATEROW_16_NEON
-ANY11I(InterpolateRow_16_Any_NEON, InterpolateRow_16_NEON, uint16_t, 1, 1, 7)
+ANY11I(InterpolateRow_16_Any_NEON,
+       InterpolateRow_16_NEON,
+       uint16_t,
+       uint16_t,
+       1,
+       1,
+       7)
 #endif
-#undef ANY11I
+
+// Any 1 to 1 interpolate with scale param
+#define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
+  void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
+               int scale, int width, int source_y_fraction) { \
+    SIMD_ALIGNED(TS temps[64 * 2]); \
+    SIMD_ALIGNED(TD tempd[64]); \
+    memset(temps, 0, sizeof(temps)); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(dst_ptr, src_ptr, src_stride, scale, n, source_y_fraction); \
+    } \
+    memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
+    if (source_y_fraction) { \
+      memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \
+             r * SBPP * sizeof(TS)); \
+    } \
+    ANY_SIMD(tempd, temps, 64, scale, MASK + 1, source_y_fraction); \
+    memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \
+  }
+
+#ifdef HAS_INTERPOLATEROW_16TO8_NEON
+ANY11IS(InterpolateRow_16To8_Any_NEON,
+        InterpolateRow_16To8_NEON,
+        uint8_t,
+        uint16_t,
+        1,
+        1,
+        7)
+#endif
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+ANY11IS(InterpolateRow_16To8_Any_AVX2,
+        InterpolateRow_16To8_AVX2,
+        uint8_t,
+        uint16_t,
+        1,
+        1,
+        31)
+#endif
+
+#undef ANY11I
+#undef ANY11IS

 // Any 1 to 1 mirror.
 #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
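ANY11IS is the same any-width wrapper pattern as ANY11I with the extra scale argument threaded through: the SIMD kernel runs on the largest multiple of MASK + 1 pixels, and the remainder is staged through zeroed temp buffers so the kernel can still process one full vector. Hand-expanding the NEON instantiation above gives roughly the following (illustrative, not literal generated source; assumes libyuv's SIMD_ALIGNED and the NEON kernel are in scope):

void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr, const uint16_t* src_ptr,
                                   ptrdiff_t src_stride, int scale, int width,
                                   int source_y_fraction) {
  SIMD_ALIGNED(uint16_t temps[64 * 2]);  // two staged source rows
  SIMD_ALIGNED(uint8_t tempd[64]);       // staged destination row
  memset(temps, 0, sizeof(temps)); /* for msan */
  int r = width & 7;   // leftover pixels
  int n = width & ~7;  // multiple-of-8 main body
  if (n > 0) {
    InterpolateRow_16To8_NEON(dst_ptr, src_ptr, src_stride, scale, n,
                              source_y_fraction);
  }
  memcpy(temps, src_ptr + n, r * sizeof(uint16_t));
  if (source_y_fraction) {
    memcpy(temps + 64, src_ptr + src_stride + n, r * sizeof(uint16_t));
  }
  InterpolateRow_16To8_NEON(tempd, temps, 64, scale, 8, source_y_fraction);
  memcpy(dst_ptr + n, tempd, r * sizeof(uint8_t));
}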
@@ -2985,6 +2985,9 @@ void DivideRow_16_C(const uint16_t* src_y,
 // 16384 = 10 bits
 // 4096 = 12 bits
 // 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
+
 void Convert16To8Row_C(const uint16_t* src_y,
                        uint8_t* dst_y,
                        int scale,
@@ -2994,7 +2997,7 @@ void Convert16To8Row_C(const uint16_t* src_y,
   assert(scale <= 32768);

   for (x = 0; x < width; ++x) {
-    dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+    dst_y[x] = C16TO8(src_y[x], scale);
   }
 }
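For reference, the scale factors in the comment block above map to bit depths because (max_value * scale) >> 16 must land on 255. A self-contained check of the C16TO8 arithmetic (clamp255 here is a local stand-in for libyuv's internal helper):

#include <assert.h>
#include <stdint.h>

// Local stand-in for libyuv's internal clamp255().
static inline uint8_t clamp255(int32_t v) {
  return (uint8_t)(v > 255 ? 255 : (v < 0 ? 0 : v));
}

#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)

int main(void) {
  assert(C16TO8(511, 32768) == 255);   // 9-bit max:  511 * 32768 >> 16
  assert(C16TO8(1023, 16384) == 255);  // 10-bit max: 1023 * 16384 >> 16
  assert(C16TO8(4095, 4096) == 255);   // 12-bit max: 4095 * 4096 >> 16
  assert(C16TO8(65535, 256) == 255);   // 16-bit max: 65535 * 256 >> 16
  return 0;
}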
@@ -3411,8 +3414,7 @@ static void HalfRow_16To8_C(const uint16_t* src_uv,
                             int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    dst_uv[x] = clamp255(
-        (((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1) * scale) >> 16);
+    dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale);
   }
 }
@@ -3426,6 +3428,9 @@ void InterpolateRow_C(uint8_t* dst_ptr,
   int y0_fraction = 256 - y1_fraction;
   const uint8_t* src_ptr1 = src_ptr + src_stride;
   int x;
+  assert(source_y_fraction >= 0);
+  assert(source_y_fraction < 256);
+
   if (y1_fraction == 0) {
     memcpy(dst_ptr, src_ptr, width);
     return;
@@ -3434,18 +3439,42 @@ void InterpolateRow_C(uint8_t* dst_ptr,
     HalfRow_C(src_ptr, src_stride, dst_ptr, width);
     return;
   }
-  for (x = 0; x < width - 1; x += 2) {
+  for (x = 0; x < width; ++x) {
     dst_ptr[0] =
         (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
-    dst_ptr[1] =
-        (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
-    src_ptr += 2;
-    src_ptr1 += 2;
-    dst_ptr += 2;
+    ++src_ptr;
+    ++src_ptr1;
+    ++dst_ptr;
   }
-  if (width & 1) {
-    dst_ptr[0] =
-        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
-  }
 }
+
+// C version 2x2 -> 2x1.
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width,
+                         int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
+  int x;
+  assert(source_y_fraction >= 0);
+  assert(source_y_fraction < 256);
+
+  if (y1_fraction == 0) {
+    memcpy(dst_ptr, src_ptr, width * 2);
+    return;
+  }
+  if (y1_fraction == 128) {
+    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
+    return;
+  }
+  for (x = 0; x < width; ++x) {
+    dst_ptr[0] =
+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+    ++src_ptr;
+    ++src_ptr1;
+    ++dst_ptr;
+  }
+}
@@ -3455,6 +3484,8 @@ void InterpolateRow_C(uint8_t* dst_ptr,
 // 16384 = 10 bits
 // 4096 = 12 bits
 // 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+
 void InterpolateRow_16To8_C(uint8_t* dst_ptr,
                             const uint16_t* src_ptr,
                             ptrdiff_t src_stride,
@@ -3465,6 +3496,9 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
   int y0_fraction = 256 - y1_fraction;
   const uint16_t* src_ptr1 = src_ptr + src_stride;
   int x;
+  assert(source_y_fraction >= 0);
+  assert(source_y_fraction < 256);
+
   if (source_y_fraction == 0) {
     Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
     return;
@@ -3473,53 +3507,13 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
     HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
     return;
   }
-  for (x = 0; x < width - 1; x += 2) {
-    dst_ptr[0] = clamp255(
-        (((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
-         scale) >>
-        16);
-    dst_ptr[1] = clamp255(
-        (((src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8) *
-         scale) >>
-        16);
-    src_ptr += 2;
-    src_ptr1 += 2;
-    dst_ptr += 2;
-  }
-  if (width & 1) {
-    dst_ptr[0] = clamp255(
-        (((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
-         scale) >>
-        16);
-  }
-}
-
-void InterpolateRow_16_C(uint16_t* dst_ptr,
-                         const uint16_t* src_ptr,
-                         ptrdiff_t src_stride,
-                         int width,
-                         int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
-  const uint16_t* src_ptr1 = src_ptr + src_stride;
-  int x;
-  if (source_y_fraction == 0) {
-    memcpy(dst_ptr, src_ptr, width * 2);
-    return;
-  }
-  if (source_y_fraction == 128) {
-    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
-    return;
-  }
-  for (x = 0; x < width - 1; x += 2) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
-    src_ptr += 2;
-    src_ptr1 += 2;
-    dst_ptr += 2;
-  }
-  if (width & 1) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
-  }
+  for (x = 0; x < width; ++x) {
+    dst_ptr[0] = C16TO8(
+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8,
+        scale);
+    src_ptr += 1;
+    src_ptr1 += 1;
+    dst_ptr += 1;
+  }
 }
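Note that the rewritten loop above is not just de-unrolling: the interpolation sum now adds +128 before the >> 8, giving the 16-to-8 path the same round-to-nearest behavior as InterpolateRow_C, and the clamp-and-scale step is routed through the shared C16TO8 macro.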
@@ -4124,6 +4118,26 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
 }
 #endif  // HAS_RAWTOYJROW_SSSE3

+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+                               const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int scale,
+                               int width,
+                               int source_y_fraction) {
+  // Row buffer for intermediate 16 bit pixels.
+  SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
+    Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
+    src_ptr += twidth;
+    dst_ptr += twidth;
+    width -= twidth;
+  }
+}
+#endif  // HAS_INTERPOLATEROW_16TO8_AVX2
+
 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
   float fsum = 0.f;
   int i;
@@ -5198,37 +5198,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
 #endif  // HAS_CONVERT8TO16ROW_AVX2

 #ifdef HAS_SPLITRGBROW_SSSE3

 // Shuffle table for converting RGB to Planar.
-static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
-                                          128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
-                                          2u, 5u, 8u, 11u, 14u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u, 128u, 1u,
-                                          4u, 7u, 10u, 13u};
-
-static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
-                                          128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
-                                          3u, 6u, 9u, 12u, 15u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u, 128u, 2u,
-                                          5u, 8u, 11u, 14u};
-
-static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
-                                          128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
-                                          4u, 7u, 10u, 13u, 128u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u, 0u, 3u,
-                                          6u, 9u, 12u, 15u};
+static const uvec8 kSplitRGBShuffle[9] = {
+    {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
+     7u, 10u, 13u},
+    {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
+     8u, 11u, 14u},
+    {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
+     12u, 15u}};

 void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                        uint8_t* dst_r,
@@ -5242,9 +5231,9 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
       "movdqu 0x20(%0),%%xmm2 \n"
-      "pshufb %5, %%xmm0 \n"
-      "pshufb %6, %%xmm1 \n"
-      "pshufb %7, %%xmm2 \n"
+      "pshufb 0(%5), %%xmm0 \n"
+      "pshufb 16(%5), %%xmm1 \n"
+      "pshufb 32(%5), %%xmm2 \n"
       "por %%xmm1,%%xmm0 \n"
       "por %%xmm2,%%xmm0 \n"
       "movdqu %%xmm0,(%1) \n"
@@ -5253,9 +5242,9 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
       "movdqu 0x20(%0),%%xmm2 \n"
-      "pshufb %8, %%xmm0 \n"
-      "pshufb %9, %%xmm1 \n"
-      "pshufb %10, %%xmm2 \n"
+      "pshufb 48(%5), %%xmm0 \n"
+      "pshufb 64(%5), %%xmm1 \n"
+      "pshufb 80(%5), %%xmm2 \n"
       "por %%xmm1,%%xmm0 \n"
       "por %%xmm2,%%xmm0 \n"
       "movdqu %%xmm0,(%2) \n"
@@ -5264,9 +5253,9 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
       "movdqu 0x20(%0),%%xmm2 \n"
-      "pshufb %11, %%xmm0 \n"
-      "pshufb %12, %%xmm1 \n"
-      "pshufb %13, %%xmm2 \n"
+      "pshufb 96(%5), %%xmm0 \n"
+      "pshufb 112(%5), %%xmm1 \n"
+      "pshufb 128(%5), %%xmm2 \n"
       "por %%xmm1,%%xmm0 \n"
       "por %%xmm2,%%xmm0 \n"
       "movdqu %%xmm0,(%3) \n"
@@ -5279,51 +5268,32 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
         "+r"(dst_g),  // %2
         "+r"(dst_b),  // %3
         "+r"(width)   // %4
-      : "m"(kShuffleMaskRGBToR0),  // %5
-        "m"(kShuffleMaskRGBToR1),  // %6
-        "m"(kShuffleMaskRGBToR2),  // %7
-        "m"(kShuffleMaskRGBToG0),  // %8
-        "m"(kShuffleMaskRGBToG1),  // %9
-        "m"(kShuffleMaskRGBToG2),  // %10
-        "m"(kShuffleMaskRGBToB0),  // %11
-        "m"(kShuffleMaskRGBToB1),  // %12
-        "m"(kShuffleMaskRGBToB2)   // %13
+      : "r"(&kSplitRGBShuffle[0])  // %5
       : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_SPLITRGBROW_SSSE3

 #ifdef HAS_MERGERGBROW_SSSE3

-// Shuffle table for converting RGB to Planar.
-static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
-                                          2u, 128u, 128u, 3u, 128u, 128u,
-                                          4u, 128u, 128u, 5u};
-static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
-                                          128u, 2u, 128u, 128u, 3u, 128u,
-                                          128u, 4u, 128u, 128u};
-static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
-                                          128u, 128u, 2u, 128u, 128u, 3u,
-                                          128u, 128u, 4u, 128u};
-
-static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
-                                          7u, 128u, 128u, 8u, 128u, 128u,
-                                          9u, 128u, 128u, 10u};
-static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
-                                          128u, 7u, 128u, 128u, 8u, 128u,
-                                          128u, 9u, 128u, 128u};
-static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
-                                          128u, 128u, 8u, 128u, 128u, 9u,
-                                          128u, 128u, 10u, 128u};
-
-static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
-                                          12u, 128u, 128u, 13u, 128u, 128u,
-                                          14u, 128u, 128u, 15u};
-static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
-                                          128u, 13u, 128u, 128u, 14u, 128u,
-                                          128u, 15u, 128u, 128u};
-static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
-                                          128u, 128u, 13u, 128u, 128u, 14u,
-                                          128u, 128u, 15u, 128u};
+// Shuffle table for converting Planar to RGB.
+static const uvec8 kMergeRGBShuffle[9] = {
+    {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
+     128u, 5u},
+    {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
+     128u, 128u},
+    {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
+     4u, 128u},
+    {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
+     10u, 128u},
+    {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
+     128u, 10u},
+    {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
+     128u, 128u},
+    {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
+     15u, 128u, 128u},
+    {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
+     128u, 15u, 128u},
+    {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
+     128u, 128u, 15u}};

 void MergeRGBRow_SSSE3(const uint8_t* src_r,
                        const uint8_t* src_g,
@@ -5337,9 +5307,9 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
       "movdqu (%0),%%xmm0 \n"
       "movdqu (%1),%%xmm1 \n"
       "movdqu (%2),%%xmm2 \n"
-      "pshufb %5, %%xmm0 \n"
-      "pshufb %6, %%xmm1 \n"
-      "pshufb %7, %%xmm2 \n"
+      "pshufb (%5), %%xmm0 \n"
+      "pshufb 16(%5), %%xmm1 \n"
+      "pshufb 32(%5), %%xmm2 \n"
       "por %%xmm1,%%xmm0 \n"
       "por %%xmm2,%%xmm0 \n"
       "movdqu %%xmm0,(%3) \n"
@@ -5347,9 +5317,9 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
       "movdqu (%0),%%xmm0 \n"
       "movdqu (%1),%%xmm1 \n"
       "movdqu (%2),%%xmm2 \n"
-      "pshufb %8, %%xmm0 \n"
-      "pshufb %9, %%xmm1 \n"
-      "pshufb %10, %%xmm2 \n"
+      "pshufb 48(%5), %%xmm0 \n"
+      "pshufb 64(%5), %%xmm1 \n"
+      "pshufb 80(%5), %%xmm2 \n"
       "por %%xmm1,%%xmm0 \n"
       "por %%xmm2,%%xmm0 \n"
       "movdqu %%xmm0,16(%3) \n"
@@ -5357,9 +5327,9 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
       "movdqu (%0),%%xmm0 \n"
       "movdqu (%1),%%xmm1 \n"
       "movdqu (%2),%%xmm2 \n"
-      "pshufb %11, %%xmm0 \n"
-      "pshufb %12, %%xmm1 \n"
-      "pshufb %13, %%xmm2 \n"
+      "pshufb 96(%5), %%xmm0 \n"
+      "pshufb 112(%5), %%xmm1 \n"
+      "pshufb 128(%5), %%xmm2 \n"
       "por %%xmm1,%%xmm0 \n"
       "por %%xmm2,%%xmm0 \n"
       "movdqu %%xmm0,32(%3) \n"
@@ -5375,15 +5345,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
         "+r"(src_b),    // %2
         "+r"(dst_rgb),  // %3
         "+r"(width)     // %4
-      : "m"(kShuffleMaskRToRGB0),  // %5
-        "m"(kShuffleMaskGToRGB0),  // %6
-        "m"(kShuffleMaskBToRGB0),  // %7
-        "m"(kShuffleMaskRToRGB1),  // %8
-        "m"(kShuffleMaskGToRGB1),  // %9
-        "m"(kShuffleMaskBToRGB1),  // %10
-        "m"(kShuffleMaskRToRGB2),  // %11
-        "m"(kShuffleMaskGToRGB2),  // %12
-        "m"(kShuffleMaskBToRGB2)   // %13
+      : "r"(&kMergeRGBShuffle[0])  // %5
       : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGERGBROW_SSSE3
@@ -3031,6 +3031,86 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
 }

+// Bilinear filter 8x2 -> 8x1
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+                               const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int scale,
+                               int dst_width,
+                               int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
+  int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
+
+  asm volatile(
+      "dup v6.8h, %w6 \n"
+      "cmp %w4, #0 \n"
+      "b.eq 100f \n"
+      "cmp %w4, #128 \n"
+      "b.eq 50f \n"
+
+      "dup v5.8h, %w4 \n"
+      "dup v4.8h, %w5 \n"
+      // General purpose row blend.
+      "1: \n"
+      "ld1 {v0.8h}, [%1], #16 \n"
+      "ld1 {v1.8h}, [%2], #16 \n"
+      "subs %w3, %w3, #8 \n"
+      "umull v2.4s, v0.4h, v4.4h \n"
+      "prfm pldl1keep, [%1, 448] \n"
+      "umull2 v3.4s, v0.8h, v4.8h \n"
+      "prfm pldl1keep, [%2, 448] \n"
+      "umlal v2.4s, v1.4h, v5.4h \n"
+      "umlal2 v3.4s, v1.8h, v5.8h \n"
+      "rshrn v0.4h, v2.4s, #8 \n"
+      "rshrn2 v0.8h, v3.4s, #8 \n"
+      "ushl v0.8h, v0.8h, v6.8h \n"
+      "uqxtn v0.8b, v0.8h \n"
+      "st1 {v0.8b}, [%0], #8 \n"
+      "b.gt 1b \n"
+      "b 99f \n"
+
+      // Blend 50 / 50.
+      "50: \n"
+      "ld1 {v0.8h}, [%1], #16 \n"
+      "ld1 {v1.8h}, [%2], #16 \n"
+      "subs %w3, %w3, #8 \n"
+      "prfm pldl1keep, [%1, 448] \n"
+      "urhadd v0.8h, v0.8h, v1.8h \n"
+      "prfm pldl1keep, [%2, 448] \n"
+      "ushl v0.8h, v0.8h, v6.8h \n"
+      "uqxtn v0.8b, v0.8h \n"
+      "st1 {v0.8b}, [%0], #8 \n"
+      "b.gt 50b \n"
+      "b 99f \n"
+
+      // Blend 100 / 0 - Copy row unchanged.
+      "100: \n"
+      "ldr q0, [%1], #16 \n"
+      "ushl v0.8h, v0.8h, v2.8h \n"  // shr = v2 is negative
+      "prfm pldl1keep, [%1, 448] \n"
+      "uqxtn v0.8b, v0.8h \n"
+      "subs %w3, %w3, #8 \n"  // 8 src pixels per loop
+      "str d0, [%0], #8 \n"   // store 8 pixels
+      "b.gt 100b \n"
+
+      "99: \n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(src_ptr1),    // %2
+        "+r"(dst_width)    // %3
+      : "r"(y1_fraction),  // %4
+        "r"(y0_fraction),  // %5
+        "r"(shift)         // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
 void ARGBBlendRow_NEON(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
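The shift precomputation in the NEON version replaces the C path's multiply-and-clamp: for the power-of-two scales documented above, (v * scale) >> 16 equals v >> (16 - log2(scale)), and 15 - clz(scale) produces exactly that amount as a negative left shift, which is what ushl with a negative element performs. A small self-contained check of the identity (10-bit case):

#include <assert.h>
#include <stdint.h>

int main(void) {
  int scale = 16384;  // 10-bit source, per the table above
  int shift = 15 - __builtin_clz((int32_t)scale);  // 15 - 17 = -2
  assert(shift == -2);
  uint16_t v = 1023;  // max 10-bit value
  // Multiply-and-shift and negative-shl-as-shr agree: both yield 255.
  assert((uint32_t)(v * scale) >> 16 == (uint32_t)v >> -shift);
  return 0;
}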
@@ -1605,6 +1605,12 @@ void ScalePlaneVertical_16(int src_height,
   }
 }

+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
 void ScalePlaneVertical_16To8(int src_height,
                               int dst_width,
                               int dst_height,
@@ -1620,7 +1626,7 @@ void ScalePlaneVertical_16To8(int src_height,
                               enum FilterMode filtering) {
   // TODO(fbarchard): Allow higher wpp.
   int dst_width_words = dst_width * wpp;
-  // TODO(https://crbug.com/libyuv/931): Add NEON and AVX2 versions.
+  // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions.
   void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb,
                                ptrdiff_t src_stride, int scale, int dst_width,
                                int source_y_fraction) = InterpolateRow_16To8_C;
@@ -1632,6 +1638,22 @@ void ScalePlaneVertical_16To8(int src_height,
   assert(dst_height > 0);
   src_argb += (x >> 16) * wpp;

+#if defined(HAS_INTERPOLATEROW_16TO8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow_16To8 = InterpolateRow_16To8_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow_16To8 = InterpolateRow_16To8_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16TO8_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow_16To8 = InterpolateRow_16To8_AVX2;
+    }
+  }
+#endif
   for (j = 0; j < dst_height; ++j) {
     int yi;
     int yf;
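The dispatch above is what the benchmarked I210ToI420 path hits: I210 is 10-bit 4:2:2 and I420 is 8-bit 4:2:0, so the chroma planes are scaled vertically 2:1 while dropping to 8 bits via ScalePlaneVertical_16To8. A hedged usage sketch (signature as I understand it from libyuv's convert.h; tightly-packed strides assumed, caller owns the buffers):

#include <stdint.h>
#include "libyuv/convert.h"  // I210ToI420

int ConvertFrame(const uint16_t* src_y, const uint16_t* src_u,
                 const uint16_t* src_v, uint8_t* dst_y, uint8_t* dst_u,
                 uint8_t* dst_v, int width, int height) {
  int half_width = (width + 1) / 2;
  return I210ToI420(src_y, width,       // 10-bit Y, full width and height
                    src_u, half_width,  // 10-bit U, half width, full height
                    src_v, half_width,  // 10-bit V, half width, full height
                    dst_y, width,       // 8-bit Y
                    dst_u, half_width,  // 8-bit U, half width and height
                    dst_v, half_width,  // 8-bit V, half width and height
                    width, height);
}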