MergeUV_AVX512BW for I420ToNV12

MergeUVPlane_Opt, 640x360, 100000 iterations:

Skylake Xeon AVX512  1196 ms
Skylake Xeon AVX2    1565 ms
Skylake Xeon SSE2    1780 ms
Pixel 7 (NEON)       1177 ms

The AVX512BW kernel is roughly 1.3x faster than AVX2 and 1.5x faster than SSE2 on the same Skylake Xeon; the Pixel 7 figure is included for cross-architecture comparison.

Bug: None
Change-Id: If47d4fa957cf27781bba5fd6a2f0bf554101a5c6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4242247
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard, 2023-02-13 10:52:58 -08:00, committed by libyuv LUCI CQ
commit 2bdc210be9 (parent b2528b0be9)
17 changed files with 390 additions and 108 deletions
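
For context, the conversion this change accelerates reduces to one libyuv call; the UV interleave inside it is what the MergeUVPlane/MergeUVRow kernels implement. The harness below is an illustrative sketch (the buffer setup and the ConvertFrame name are mine, not part of the commit):

#include <cstdint>
#include <vector>
#include "libyuv/convert_from.h"  // I420ToNV12()

// Convert one I420 frame to NV12. dst_uv receives the interleaved UV
// plane produced by the MergeUV row kernels benchmarked above.
void ConvertFrame(int width, int height) {
  const int halfwidth = (width + 1) / 2;
  const int halfheight = (height + 1) / 2;
  std::vector<uint8_t> src_y(width * height), dst_y(width * height);
  std::vector<uint8_t> src_u(halfwidth * halfheight);
  std::vector<uint8_t> src_v(halfwidth * halfheight);
  std::vector<uint8_t> dst_uv(2 * halfwidth * halfheight);
  libyuv::I420ToNV12(src_y.data(), width, src_u.data(), halfwidth,
                     src_v.data(), halfwidth, dst_y.data(), width,
                     dst_uv.data(), 2 * halfwidth, width, height);
}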

View File

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1857
+Version: 1860
 License: BSD
 License File: LICENSE

View File

@@ -232,6 +232,27 @@ void TransposeWx1_16_C(const uint16_t* src,
                        uint16_t* dst,
                        int dst_stride,
                        int width);
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
+void Transpose4x4_32_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+
+// Transpose 32 bit values (ARGB)
+void Transpose8x8_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

View File

@@ -402,9 +402,11 @@ extern "C" {
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
 // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+// TODO(fbarchard): Port MERGEUV to assembly
 #if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512))
+    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512) && !defined(_MSC_VER))
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
+#define HAS_MERGEUVROW_AVX512BW
 #endif
 
 // The following are available for AVX512 clang x64 platforms:
@@ -2184,6 +2186,10 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width);
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_uv,
+                         int width);
 void MergeUVRow_NEON(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
@@ -2204,6 +2210,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
                          const uint8_t* uv_buf,
                          uint8_t* dst_ptr,
                          int width);
+void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
 void MergeUVRow_Any_NEON(const uint8_t* y_buf,
                          const uint8_t* uv_buf,
                          uint8_t* dst_ptr,

View File

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1857
+#define LIBYUV_VERSION 1860
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -924,6 +924,14 @@ int I422ToNV21(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow = MergeUVRow_Any_NEON;

View File

@@ -389,6 +389,14 @@ int ARGBToNV12(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -559,6 +567,14 @@ int ARGBToNV21(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -726,6 +742,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -894,6 +918,14 @@ int ABGRToNV21(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -2921,6 +2953,14 @@ int RAWToJNV21(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;

View File

@@ -599,6 +599,14 @@ void MergeUVPlane(const uint8_t* src_u,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 32)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow = MergeUVRow_Any_NEON;

View File

@@ -166,6 +166,63 @@ void TransposeWxH_16_C(const uint16_t* src,
   }
 }
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width) {
+  const uint8_t* src1 = src + src_stride;
+  const uint8_t* src2 = src1 + src_stride;
+  const uint8_t* src3 = src2 + src_stride;
+  uint8_t* dst1 = dst + dst_stride;
+  uint8_t* dst2 = dst1 + dst_stride;
+  uint8_t* dst3 = dst2 + dst_stride;
+  int i;
+  for (i = 0; i < width; i += 4) {
+    uint32_t p00 = ((uint32_t*)(src))[0];
+    uint32_t p10 = ((uint32_t*)(src))[1];
+    uint32_t p20 = ((uint32_t*)(src))[2];
+    uint32_t p30 = ((uint32_t*)(src))[3];
+    uint32_t p01 = ((uint32_t*)(src1))[0];
+    uint32_t p11 = ((uint32_t*)(src1))[1];
+    uint32_t p21 = ((uint32_t*)(src1))[2];
+    uint32_t p31 = ((uint32_t*)(src1))[3];
+    uint32_t p02 = ((uint32_t*)(src2))[0];
+    uint32_t p12 = ((uint32_t*)(src2))[1];
+    uint32_t p22 = ((uint32_t*)(src2))[2];
+    uint32_t p32 = ((uint32_t*)(src2))[3];
+    uint32_t p03 = ((uint32_t*)(src3))[0];
+    uint32_t p13 = ((uint32_t*)(src3))[1];
+    uint32_t p23 = ((uint32_t*)(src3))[2];
+    uint32_t p33 = ((uint32_t*)(src3))[3];
+    ((uint32_t*)(dst))[0] = p00;
+    ((uint32_t*)(dst))[1] = p01;
+    ((uint32_t*)(dst))[2] = p02;
+    ((uint32_t*)(dst))[3] = p03;
+    ((uint32_t*)(dst1))[0] = p10;
+    ((uint32_t*)(dst1))[1] = p11;
+    ((uint32_t*)(dst1))[2] = p12;
+    ((uint32_t*)(dst1))[3] = p13;
+    ((uint32_t*)(dst2))[0] = p20;
+    ((uint32_t*)(dst2))[1] = p21;
+    ((uint32_t*)(dst2))[2] = p22;
+    ((uint32_t*)(dst2))[3] = p23;
+    ((uint32_t*)(dst3))[0] = p30;
+    ((uint32_t*)(dst3))[1] = p31;
+    ((uint32_t*)(dst3))[2] = p32;
+    ((uint32_t*)(dst3))[3] = p33;
+    src += src_stride * 4;  // advance 4 rows
+    src1 += src_stride * 4;
+    src2 += src_stride * 4;
+    src3 += src_stride * 4;
+    dst += 4 * 4;  // advance 4 columns
+    dst1 += 4 * 4;
+    dst2 += 4 * 4;
+    dst3 += 4 * 4;
+  }
+}
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
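
A note on the loop structure above: each iteration transposes one 4x4 block of 32-bit (ARGB) pixels, reading the block column-wise and writing it row-wise; the source pointers then step down four rows while the destination pointers step right four columns. A minimal single-block usage sketch (mine, for illustration, not part of the commit):

#include <stdint.h>

uint32_t src[4][4];  // one 4x4 block of ARGB pixels
uint32_t dst[4][4];
// Strides are in bytes: 4 pixels * 4 bytes per row.
Transpose4x4_32_C((const uint8_t*)src, 4 * 4, (uint8_t*)dst, 4 * 4, 4);
// Afterwards dst[r][c] == src[c][r] for all r, c in 0..3.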

View File

@@ -435,6 +435,45 @@ void TransposeUVWx8_NEON(const uint8_t* src,
       : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
         "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
 }
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  const uint8_t* src1 = src + src_stride;
+  const uint8_t* src2 = src1 + src_stride;
+  const uint8_t* src3 = src2 + src_stride;
+  uint8_t* dst1 = dst + dst_stride;
+  uint8_t* dst2 = dst1 + dst_stride;
+  uint8_t* dst3 = dst2 + dst_stride;
+  asm volatile(
+      // Main loop transpose 4x4. Read a column, write a row.
+      "1:                                        \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n"
+      "subs        %w8, %w8, #4                  \n"  // w -= 4
+      "st1         {v0.4s}, [%4], 16             \n"
+      "st1         {v1.4s}, [%5], 16             \n"
+      "st1         {v2.4s}, [%6], 16             \n"
+      "st1         {v3.4s}, [%7], 16             \n"
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(dst),   // %4
+        "+r"(dst1),  // %5
+        "+r"(dst2),  // %6
+        "+r"(dst3),  // %7
+        "+r"(width)  // %8
+      : "r"((ptrdiff_t)(src_stride * 4))  // %9
+      : "memory", "cc", "v0", "v1", "v2", "v3");
+}
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus

View File

@@ -571,6 +571,9 @@ ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
 #ifdef HAS_MERGEUVROW_AVX2
 ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
 #endif
+#ifdef HAS_MERGEUVROW_AVX512BW
+ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
+#endif
 #ifdef HAS_MERGEUVROW_NEON
 ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
 #endif
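
The trailing MASK argument (31) matches the AVX512BW kernel's 32-pixels-per-iteration step: the Any wrapper runs the SIMD kernel over the largest multiple-of-32 prefix and mops up the ragged tail separately. A simplified model of what the generated wrapper does (the real ANY21 macro instead stages the tail through aligned scratch buffers and reruns the SIMD kernel on it):

void MergeUVRow_Any_AVX512BW_model(const uint8_t* src_u, const uint8_t* src_v,
                                   uint8_t* dst_uv, int width) {
  int n = width & ~31;  // multiple-of-32 body for the SIMD kernel
  if (n > 0) {
    MergeUVRow_AVX512BW(src_u, src_v, dst_uv, n);
  }
  // Scalar tail of 0..31 pixels; each UV pair is 2 output bytes.
  MergeUVRow_C(src_u + n, src_v + n, dst_uv + 2 * n, width & 31);
}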

View File

@@ -17,6 +17,8 @@ extern "C" {
 // This module is for GCC x86 and x64.
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#include <immintrin.h>
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 // Constants for ARGB
@@ -5142,6 +5144,25 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
 }
 #endif  // HAS_DETILESPLITUVROW_SSSE3
 
+#ifdef HAS_MERGEUVROW_AVX512BW
+__attribute__((target("avx512vl,avx512bw")))
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  do {
+    const __m512i u = _mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_u));
+    const __m512i v =
+        _mm512_slli_epi64(_mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_v)), 8);
+    const __m512i uv = _mm512_or_si512(u, v);
+    _mm512_storeu_epi8(dst_uv, uv);
+    src_u += 32;
+    src_v += 32;
+    dst_uv += 64;
+    width -= 32;
+  } while (width > 0);
+}
+#endif  // HAS_MERGEUVROW_AVX512BW
+
 #ifdef HAS_MERGEUVROW_AVX2
 void MergeUVRow_AVX2(const uint8_t* src_u,
                      const uint8_t* src_v,
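
The kernel's widen/shift/or sequence is a vectorized byte interleave: zero-extending U fills the low byte of each 16-bit lane, shifting V left by 8 fills the high byte, and on a little-endian target the combined lane u | (v << 8) is stored as the byte pair U,V that NV12 expects. A scalar restatement (for exposition only, not part of the commit):

#include <stdint.h>
#include <string.h>

// Scalar model of MergeUVRow_AVX512BW; assumes a little-endian target.
void MergeUVRow_model(const uint8_t* src_u, const uint8_t* src_v,
                      uint8_t* dst_uv, int width) {
  for (int i = 0; i < width; ++i) {
    const uint16_t uv = (uint16_t)(src_u[i] | (src_v[i] << 8));
    memcpy(dst_uv + 2 * i, &uv, sizeof(uv));  // writes U then V
  }
}

Because the do/while loop rounds each row up to a whole number of 32-pixel steps, a width that is not a multiple of 32 would read and write past the end of the row; that is why the dispatch sites only select this raw kernel behind an IS_ALIGNED width check and otherwise use the Any wrapper.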

View File

@@ -2047,10 +2047,12 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
       "xvld $xr4, %0, 0 \n\t"
       "xvld $xr5, %0, 32 \n\t"
       "xvld $xr6, %0, 64 \n\t"
-      "xvld $xr7, %0, 96 \n\t"  // load 32 pixels of ARGB
+      "xvld $xr7, %0, 96 \n\t"  // load 32 pixels of
+                                // ARGB
       "xvor.v $xr12, $xr3, $xr3 \n\t"
       "xvor.v $xr13, $xr3, $xr3 \n\t"
-      "addi.d %2, %2, -32 \n\t"  // 32 processed per loop.
+      "addi.d %2, %2, -32 \n\t"  // 32 processed per
+                                 // loop.
       "xvpickev.b $xr8, $xr5, $xr4 \n\t"  // BR
       "xvpickev.b $xr10, $xr7, $xr6 \n\t"
       "xvpickod.b $xr9, $xr5, $xr4 \n\t"  // GA
@@ -2070,10 +2072,8 @@
       : "+&r"(src_argb),  // %0
         "+&r"(dst_y),     // %1
         "+&r"(width)      // %2
-      : "r"(rgbconstants),
-        "r"(shuff)
-      : "memory"
-  );
+      : "r"(rgbconstants), "r"(shuff)
+      : "memory");
 }
 
 void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -2109,10 +2109,12 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
       "xvld $xr4, %0, 0 \n\t"
       "xvld $xr5, %0, 32 \n\t"
       "xvld $xr6, %0, 64 \n\t"
-      "xvld $xr7, %0, 96 \n\t"  // load 32 pixels of RGBA
+      "xvld $xr7, %0, 96 \n\t"  // load 32 pixels of
+                                // RGBA
       "xvor.v $xr12, $xr3, $xr3 \n\t"
       "xvor.v $xr13, $xr3, $xr3 \n\t"
-      "addi.d %2, %2, -32 \n\t"  // 32 processed per loop.
+      "addi.d %2, %2, -32 \n\t"  // 32 processed per
+                                 // loop.
       "xvpickev.b $xr8, $xr5, $xr4 \n\t"  // AG
       "xvpickev.b $xr10, $xr7, $xr6 \n\t"
       "xvpickod.b $xr9, $xr5, $xr4 \n\t"  // BR
@@ -2132,10 +2134,8 @@
       : "+&r"(src_rgba),  // %0
         "+&r"(dst_y),     // %1
         "+&r"(width)      // %2
-      : "r"(rgbconstants),
-        "r"(shuff)
-      : "memory"
-  );
+      : "r"(rgbconstants), "r"(shuff)
+      : "memory");
 }
 
 void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@@ -2154,7 +2154,8 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
                                  uint8_t* dst_y,
                                  int width,
                                  const struct RgbConstants* rgbconstants) {
-  int8_t shuff[128] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
+  int8_t shuff[128] = {
+      0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
       0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
       24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
       24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
@@ -2174,11 +2175,13 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
       "1: \n\t"
       "xvld $xr8, %0, 0 \n\t"
       "xvld $xr9, %0, 32 \n\t"
-      "xvld $xr10, %0, 64 \n\t"  // load 32 pixels of RGB
+      "xvld $xr10, %0, 64 \n\t"  // load 32 pixels of
+                                 // RGB
       "xvor.v $xr12, $xr3, $xr3 \n\t"
       "xvor.v $xr13, $xr3, $xr3 \n\t"
       "xvor.v $xr11, $xr9, $xr9 \n\t"
-      "addi.d %2, %2, -32 \n\t"  // 32 processed per loop.
+      "addi.d %2, %2, -32 \n\t"  // 32 processed per
+                                 // loop.
       "xvpermi.q $xr9, $xr8, 0x30 \n\t"   // src0
       "xvpermi.q $xr8, $xr10, 0x03 \n\t"  // src1
       "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2
@@ -2202,8 +2205,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
         "+&r"(width)        // %2
       : "r"(rgbconstants),  // %3
         "r"(shuff)          // %4
-      : "memory"
-  );
+      : "memory");
 }
 
 void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {

View File

@@ -1688,10 +1688,12 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
       "vld $vr4, %0, 0 \n\t"
       "vld $vr5, %0, 16 \n\t"
       "vld $vr6, %0, 32 \n\t"
-      "vld $vr7, %0, 48 \n\t"  // load 16 pixels of ARGB
+      "vld $vr7, %0, 48 \n\t"  // load 16 pixels of
+                               // ARGB
       "vor.v $vr12, $vr3, $vr3 \n\t"
       "vor.v $vr13, $vr3, $vr3 \n\t"
-      "addi.d %2, %2, -16 \n\t"  // 16 processed per loop.
+      "addi.d %2, %2, -16 \n\t"  // 16 processed per
+                                 // loop.
       "vpickev.b $vr8, $vr5, $vr4 \n\t"  // BR
       "vpickev.b $vr10, $vr7, $vr6 \n\t"
       "vpickod.b $vr9, $vr5, $vr4 \n\t"  // GA
@@ -1711,8 +1713,7 @@
         "+&r"(dst_y),  // %1
         "+&r"(width)   // %2
       : "r"(rgbconstants)
-      : "memory"
-  );
+      : "memory");
 }
 
 void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -1746,10 +1747,12 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
       "vld $vr4, %0, 0 \n\t"
       "vld $vr5, %0, 16 \n\t"
       "vld $vr6, %0, 32 \n\t"
-      "vld $vr7, %0, 48 \n\t"  // load 16 pixels of RGBA
+      "vld $vr7, %0, 48 \n\t"  // load 16 pixels of
+                               // RGBA
       "vor.v $vr12, $vr3, $vr3 \n\t"
       "vor.v $vr13, $vr3, $vr3 \n\t"
-      "addi.d %2, %2, -16 \n\t"  // 16 processed per loop.
+      "addi.d %2, %2, -16 \n\t"  // 16 processed per
+                                 // loop.
       "vpickev.b $vr8, $vr5, $vr4 \n\t"  // AG
       "vpickev.b $vr10, $vr7, $vr6 \n\t"
       "vpickod.b $vr9, $vr5, $vr4 \n\t"  // BR
@@ -1769,8 +1772,7 @@
         "+&r"(dst_y),  // %1
         "+&r"(width)   // %2
       : "r"(rgbconstants)
-      : "memory"
-  );
+      : "memory");
 }
 
 void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@@ -1789,10 +1791,11 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
                                 uint8_t* dst_y,
                                 int width,
                                 const struct RgbConstants* rgbconstants) {
-  int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
-                      24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
-                      1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
-                      25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
+  int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
+                      20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6,
+                      7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
+                      0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
+                      31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
   asm volatile(
       "vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
       "vldrepl.b $vr1, %3, 1 \n\t"  // load rgbconstants
@@ -1805,10 +1808,12 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
       "1: \n\t"
       "vld $vr8, %0, 0 \n\t"
       "vld $vr9, %0, 16 \n\t"
-      "vld $vr10, %0, 32 \n\t"  // load 16 pixels of RGB
+      "vld $vr10, %0, 32 \n\t"  // load 16 pixels of
+                                // RGB
       "vor.v $vr12, $vr3, $vr3 \n\t"
       "vor.v $vr13, $vr3, $vr3 \n\t"
-      "addi.d %2, %2, -16 \n\t"  // 16 processed per loop.
+      "addi.d %2, %2, -16 \n\t"  // 16 processed per
+                                 // loop.
       "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t"
       "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t"
       "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t"
@@ -1829,8 +1834,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
         "+&r"(width)        // %2
       : "r"(rgbconstants),  // %3
         "r"(shuff)          // %4
-      : "memory"
-  );
+      : "memory");
 }
 
 void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {

View File

@@ -820,6 +820,28 @@ void MergeUVRow_NEON(const uint8_t* src_u,
       : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
+
+// Reads 32 U's and V's and writes out 32 pairs of UV.
+void MergeUVRow_NEON1(const uint8_t* src_u,
+                      const uint8_t* src_v,
+                      uint8_t* dst_uv,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load first 16 U
+      "ld1         {v2.16b}, [%0], #16           \n"  // load next 16 U
+      "ld1         {v1.16b}, [%1], #16           \n"  // load first 16 V
+      "ld1         {v3.16b}, [%1], #16           \n"  // load next 16 V
+      "subs        %w3, %w3, #32                 \n"  // 32 processed per loop
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "st2         {v0.16b,v1.16b}, [%2], #32    \n"  // store first 16 UV pairs
+      "st2         {v2.16b,v3.16b}, [%2], #32    \n"  // store next 16 UV pairs
+      "b.gt        1b                            \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3  // Output registers
+      :                // Input registers
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
 
 void MergeUVRow_16_NEON(const uint16_t* src_u,
                         const uint16_t* src_v,

View File

@@ -14,6 +14,10 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/rotate.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/rotate_row.h"
+#endif
 
 namespace libyuv {
 
 #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
@@ -858,4 +862,47 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
                         disable_cpu_flags_, benchmark_cpu_info_);
 }
 
+#if defined(ENABLE_ROW_TESTS)
+TEST_F(LibYUVRotateTest, Transpose4x4) {
+  // dst width and height
+  const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
+  const int height = 4;
+  align_buffer_page_end(src_pixels, height * width * 4);
+  align_buffer_page_end(dst_pixels_c, width * height * 4);
+  align_buffer_page_end(dst_pixels_opt, width * height * 4);
+
+  MemRandomize(src_pixels, height * width * 4);
+  memset(dst_pixels_c, 1, width * height * 4);
+  memset(dst_pixels_opt, 1, width * height * 4);
+
+  Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                    (uint8_t*)dst_pixels_c, width * 4, width);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+#if defined(__aarch64__)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else {
+      Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                        (uint8_t*)dst_pixels_opt, width * 4, width);
+    }
+#else
+    Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                      (uint8_t*)dst_pixels_opt, width * 4, width);
+#endif
+  }
+
+  // for (int i = 0; i < width * height; ++i) {
+  //   EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  // }
+
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_c);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+}
+#endif  // ENABLE_ROW_TESTS
+
 }  // namespace libyuv
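
Assuming a Chromium-style GN build of libyuv with ENABLE_ROW_TESTS defined (the new test is compiled conditionally), the test can be run in isolation with something like:

ninja -C out/Release libyuv_unittest
out/Release/libyuv_unittest --gtest_filter=LibYUVRotateTest.Transpose4x4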