mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
ARGBAttenuate use (a * b + 255) >> 8
- Makes ARM and Intel match and fixes some off-by-1 cases - Add ARGBToUV444MatrixRow_NEON - Add ConvertFP16ToFP32Column_NEON - scale_rvv fix intrinsic build error - disable row_win version of ARGBAttenuate/Unattenuate Bug: libyuv:936, libyuv:956 Change-Id: Ied99aaad3a11a8eb69212b628c58f86ec0723c38 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4617013 Reviewed-by: Wan-Teh Chang <wtc@google.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
04821d1e7d
commit
a366ad714a
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1872
|
Version: 1873
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -161,7 +161,6 @@ extern "C" {
|
|||||||
#define HAS_ARGBSEPIAROW_SSSE3
|
#define HAS_ARGBSEPIAROW_SSSE3
|
||||||
#define HAS_ARGBSHADEROW_SSE2
|
#define HAS_ARGBSHADEROW_SSE2
|
||||||
#define HAS_ARGBSUBTRACTROW_SSE2
|
#define HAS_ARGBSUBTRACTROW_SSE2
|
||||||
#define HAS_ARGBUNATTENUATEROW_SSE2
|
|
||||||
#define HAS_BLENDPLANEROW_SSSE3
|
#define HAS_BLENDPLANEROW_SSSE3
|
||||||
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
|
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
|
||||||
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
|
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
|
||||||
@ -171,9 +170,6 @@ extern "C" {
|
|||||||
#define HAS_SOBELXROW_SSE2
|
#define HAS_SOBELXROW_SSE2
|
||||||
#define HAS_SOBELXYROW_SSE2
|
#define HAS_SOBELXYROW_SSE2
|
||||||
#define HAS_SOBELYROW_SSE2
|
#define HAS_SOBELYROW_SSE2
|
||||||
#if !defined(LIBYUV_BIT_EXACT)
|
|
||||||
#define HAS_ARGBATTENUATEROW_SSSE3
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
|
// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
|
||||||
// caveat: clangcl uses row_win.cc which works.
|
// caveat: clangcl uses row_win.cc which works.
|
||||||
@ -241,11 +237,7 @@ extern "C" {
|
|||||||
#define HAS_ARGBADDROW_AVX2
|
#define HAS_ARGBADDROW_AVX2
|
||||||
#define HAS_ARGBMULTIPLYROW_AVX2
|
#define HAS_ARGBMULTIPLYROW_AVX2
|
||||||
#define HAS_ARGBSUBTRACTROW_AVX2
|
#define HAS_ARGBSUBTRACTROW_AVX2
|
||||||
#define HAS_ARGBUNATTENUATEROW_AVX2
|
|
||||||
#define HAS_BLENDPLANEROW_AVX2
|
#define HAS_BLENDPLANEROW_AVX2
|
||||||
#if !defined(LIBYUV_BIT_EXACT)
|
|
||||||
#define HAS_ARGBATTENUATEROW_AVX2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
|
#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
|
||||||
defined(_MSC_VER)
|
defined(_MSC_VER)
|
||||||
@ -285,14 +277,15 @@ extern "C" {
|
|||||||
#define HAS_ABGRTOAR30ROW_SSSE3
|
#define HAS_ABGRTOAR30ROW_SSSE3
|
||||||
#define HAS_ABGRTOYJROW_SSSE3
|
#define HAS_ABGRTOYJROW_SSSE3
|
||||||
#define HAS_AR64TOARGBROW_SSSE3
|
#define HAS_AR64TOARGBROW_SSSE3
|
||||||
|
#define HAS_ARGBATTENUATEROW_SSSE3
|
||||||
#define HAS_ARGBTOAB64ROW_SSSE3
|
#define HAS_ARGBTOAB64ROW_SSSE3
|
||||||
#define HAS_ARGBTOAR30ROW_SSSE3
|
#define HAS_ARGBTOAR30ROW_SSSE3
|
||||||
#define HAS_ARGBTOAR64ROW_SSSE3
|
#define HAS_ARGBTOAR64ROW_SSSE3
|
||||||
|
#define HAS_ARGBUNATTENUATEROW_SSE2
|
||||||
#define HAS_CONVERT16TO8ROW_SSSE3
|
#define HAS_CONVERT16TO8ROW_SSSE3
|
||||||
#define HAS_CONVERT8TO16ROW_SSE2
|
#define HAS_CONVERT8TO16ROW_SSE2
|
||||||
#define HAS_DETILEROW_SSE2
|
|
||||||
#define HAS_DETILEROW_16_SSE2
|
#define HAS_DETILEROW_16_SSE2
|
||||||
#define HAS_DETILEROW_16_AVX
|
#define HAS_DETILEROW_SSE2
|
||||||
#define HAS_DETILESPLITUVROW_SSSE3
|
#define HAS_DETILESPLITUVROW_SSSE3
|
||||||
#define HAS_DETILETOYUY2_SSE2
|
#define HAS_DETILETOYUY2_SSE2
|
||||||
#define HAS_HALFMERGEUVROW_SSSE3
|
#define HAS_HALFMERGEUVROW_SSSE3
|
||||||
@ -345,13 +338,16 @@ extern "C" {
|
|||||||
#define HAS_ABGRTOYJROW_AVX2
|
#define HAS_ABGRTOYJROW_AVX2
|
||||||
#define HAS_ABGRTOYROW_AVX2
|
#define HAS_ABGRTOYROW_AVX2
|
||||||
#define HAS_AR64TOARGBROW_AVX2
|
#define HAS_AR64TOARGBROW_AVX2
|
||||||
|
#define HAS_ARGBATTENUATEROW_AVX2
|
||||||
#define HAS_ARGBTOAB64ROW_AVX2
|
#define HAS_ARGBTOAB64ROW_AVX2
|
||||||
#define HAS_ARGBTOAR30ROW_AVX2
|
#define HAS_ARGBTOAR30ROW_AVX2
|
||||||
#define HAS_ARGBTOAR64ROW_AVX2
|
#define HAS_ARGBTOAR64ROW_AVX2
|
||||||
#define HAS_ARGBTORAWROW_AVX2
|
#define HAS_ARGBTORAWROW_AVX2
|
||||||
#define HAS_ARGBTORGB24ROW_AVX2
|
#define HAS_ARGBTORGB24ROW_AVX2
|
||||||
|
#define HAS_ARGBUNATTENUATEROW_AVX2
|
||||||
#define HAS_CONVERT16TO8ROW_AVX2
|
#define HAS_CONVERT16TO8ROW_AVX2
|
||||||
#define HAS_CONVERT8TO16ROW_AVX2
|
#define HAS_CONVERT8TO16ROW_AVX2
|
||||||
|
#define HAS_DETILEROW_16_AVX
|
||||||
#define HAS_DIVIDEROW_16_AVX2
|
#define HAS_DIVIDEROW_16_AVX2
|
||||||
#define HAS_HALFMERGEUVROW_AVX2
|
#define HAS_HALFMERGEUVROW_AVX2
|
||||||
#define HAS_I210TOAR30ROW_AVX2
|
#define HAS_I210TOAR30ROW_AVX2
|
||||||
@ -6190,6 +6186,11 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
|
|||||||
void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
|
void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
|
||||||
float* dst,
|
float* dst,
|
||||||
int width);
|
int width);
|
||||||
|
// Convert a column of FP16 Half Floats to a row of FP32 Floats
|
||||||
|
void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16
|
||||||
|
int src_stride, // stride in elements
|
||||||
|
float* dst,
|
||||||
|
int width);
|
||||||
// Convert FP32 Floats to FP16 Half Floats
|
// Convert FP32 Floats to FP16 Half Floats
|
||||||
void ConvertFP32ToFP16Row_NEON(const float* src,
|
void ConvertFP32ToFP16Row_NEON(const float* src,
|
||||||
uint16_t* dst, // fp16
|
uint16_t* dst, // fp16
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1872
|
#define LIBYUV_VERSION 1873
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -48,7 +48,6 @@ extern "C" {
|
|||||||
defined(__i386__) || defined(_M_IX86))
|
defined(__i386__) || defined(_M_IX86))
|
||||||
#define LIBYUV_ARGBTOUV_PAVGB 1
|
#define LIBYUV_ARGBTOUV_PAVGB 1
|
||||||
#define LIBYUV_RGBTOU_TRUNCATE 1
|
#define LIBYUV_RGBTOU_TRUNCATE 1
|
||||||
#define LIBYUV_ATTENUATE_DUP 1
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(LIBYUV_BIT_EXACT)
|
#if defined(LIBYUV_BIT_EXACT)
|
||||||
#define LIBYUV_UNATTENUATE_DUP 1
|
#define LIBYUV_UNATTENUATE_DUP 1
|
||||||
@ -3369,12 +3368,7 @@ void BlendPlaneRow_C(const uint8_t* src0,
|
|||||||
}
|
}
|
||||||
#undef UBLEND
|
#undef UBLEND
|
||||||
|
|
||||||
#if LIBYUV_ATTENUATE_DUP
|
#define ATTENUATE(f, a) (f * a + 255) >> 8
|
||||||
// This code mimics the SSSE3 version for better testability.
|
|
||||||
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
|
|
||||||
#else
|
|
||||||
#define ATTENUATE(f, a) (f * a + 128) >> 8
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Multiply source RGB by alpha and store to destination.
|
// Multiply source RGB by alpha and store to destination.
|
||||||
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||||
|
|||||||
@ -7441,83 +7441,95 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
|
|||||||
|
|
||||||
#ifdef HAS_ARGBATTENUATEROW_SSSE3
|
#ifdef HAS_ARGBATTENUATEROW_SSSE3
|
||||||
// Shuffle table duplicating alpha.
|
// Shuffle table duplicating alpha.
|
||||||
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
|
static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128,
|
||||||
7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
|
-128, -128, 14, -128, 14, -128,
|
||||||
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
|
14, -128, -128, -128};
|
||||||
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
|
|
||||||
// Attenuate 4 pixels at a time.
|
// Attenuate 4 pixels at a time.
|
||||||
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
|
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"pcmpeqb %%xmm3,%%xmm3 \n"
|
|
||||||
"pslld $0x18,%%xmm3 \n"
|
|
||||||
"movdqa %3,%%xmm4 \n"
|
"movdqa %3,%%xmm4 \n"
|
||||||
"movdqa %4,%%xmm5 \n"
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||||
|
"pslld $0x18,%%xmm5 \n"
|
||||||
|
"pxor %%xmm6,%%xmm6 \n"
|
||||||
|
"pcmpeqb %%xmm7,%%xmm7 \n"
|
||||||
|
"punpcklbw %%xmm6,%%xmm7 \n"
|
||||||
|
"sub %0,%1 \n"
|
||||||
|
|
||||||
// 4 pixel loop.
|
// 4 pixel loop.
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm0 \n"
|
"movdqu (%0),%%xmm6 \n"
|
||||||
"pshufb %%xmm4,%%xmm0 \n"
|
"movdqa %%xmm6,%%xmm0 \n"
|
||||||
"movdqu (%0),%%xmm1 \n"
|
"movdqa %%xmm6,%%xmm1 \n"
|
||||||
"punpcklbw %%xmm1,%%xmm1 \n"
|
"punpcklbw %%xmm5,%%xmm0 \n"
|
||||||
"pmulhuw %%xmm1,%%xmm0 \n"
|
"punpckhbw %%xmm5,%%xmm1 \n"
|
||||||
"movdqu (%0),%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm2 \n"
|
||||||
"pshufb %%xmm5,%%xmm1 \n"
|
"movdqa %%xmm1,%%xmm3 \n"
|
||||||
"movdqu (%0),%%xmm2 \n"
|
"pshufb %%xmm4,%%xmm2 \n" // a,a,a,0
|
||||||
"punpckhbw %%xmm2,%%xmm2 \n"
|
"pshufb %%xmm4,%%xmm3 \n"
|
||||||
"pmulhuw %%xmm2,%%xmm1 \n"
|
"pmullw %%xmm2,%%xmm0 \n" // rgb * alpha
|
||||||
"movdqu (%0),%%xmm2 \n"
|
"pmullw %%xmm3,%%xmm1 \n"
|
||||||
"lea 0x10(%0),%0 \n"
|
"paddw %%xmm7,%%xmm0 \n" // + 255
|
||||||
"pand %%xmm3,%%xmm2 \n"
|
"paddw %%xmm7,%%xmm1 \n"
|
||||||
"psrlw $0x8,%%xmm0 \n"
|
"psrlw $0x8,%%xmm0 \n"
|
||||||
"psrlw $0x8,%%xmm1 \n"
|
"psrlw $0x8,%%xmm1 \n"
|
||||||
"packuswb %%xmm1,%%xmm0 \n"
|
"packuswb %%xmm1,%%xmm0 \n"
|
||||||
"por %%xmm2,%%xmm0 \n"
|
"pand %%xmm5,%%xmm6 \n"
|
||||||
"movdqu %%xmm0,(%1) \n"
|
"por %%xmm6,%%xmm0 \n"
|
||||||
"lea 0x10(%1),%1 \n"
|
"movdqu %%xmm0,(%0,%1) \n"
|
||||||
|
"lea 0x10(%0),%0 \n"
|
||||||
"sub $0x4,%2 \n"
|
"sub $0x4,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
: "m"(kShuffleAlpha0), // %3
|
: "m"(kAttenuateShuffle) // %3
|
||||||
"m"(kShuffleAlpha1) // %4
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
"xmm7");
|
||||||
}
|
}
|
||||||
#endif // HAS_ARGBATTENUATEROW_SSSE3
|
#endif // HAS_ARGBATTENUATEROW_SSSE3
|
||||||
|
|
||||||
#ifdef HAS_ARGBATTENUATEROW_AVX2
|
#ifdef HAS_ARGBATTENUATEROW_AVX2
|
||||||
|
|
||||||
// Shuffle table duplicating alpha.
|
// Shuffle table duplicating alpha.
|
||||||
static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
|
static const lvec8 kAttenuateShuffle_AVX2 = {
|
||||||
128u, 128u, 14u, 15u, 14u, 15u,
|
6, -128, 6, -128, 6, -128, -128, -128, 14, -128, 14,
|
||||||
14u, 15u, 128u, 128u};
|
-128, 14, -128, -128, -128, 22, -128, 22, -128, 22, -128,
|
||||||
|
-128, -128, 30, -128, 30, -128, 30, -128, -128, -128};
|
||||||
|
|
||||||
// Attenuate 8 pixels at a time.
|
// Attenuate 8 pixels at a time.
|
||||||
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
|
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vbroadcastf128 %3,%%ymm4 \n"
|
"vmovdqa %3,%%ymm4 \n"
|
||||||
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||||
"vpslld $0x18,%%ymm5,%%ymm5 \n"
|
"vpslld $0x18,%%ymm5,%%ymm5 \n"
|
||||||
|
"vpxor %%ymm6,%%ymm6,%%ymm6 \n"
|
||||||
|
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"
|
||||||
|
"vpunpcklbw %%ymm6,%%ymm7,%%ymm7 \n"
|
||||||
"sub %0,%1 \n"
|
"sub %0,%1 \n"
|
||||||
|
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vmovdqu (%0),%%ymm6 \n"
|
"vmovdqu (%0),%%ymm6 \n"
|
||||||
"vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
|
"vpunpcklbw %%ymm5,%%ymm6,%%ymm0 \n"
|
||||||
"vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
|
"vpunpckhbw %%ymm5,%%ymm6,%%ymm1 \n"
|
||||||
"vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
|
"vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
|
||||||
"vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
|
"vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
|
||||||
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
|
"vpmullw %%ymm2,%%ymm0,%%ymm0 \n"
|
||||||
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
|
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
|
||||||
"vpand %%ymm5,%%ymm6,%%ymm6 \n"
|
"vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpaddw %%ymm7,%%ymm1,%%ymm1 \n"
|
||||||
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
|
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
|
||||||
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
|
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
|
||||||
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
||||||
"vpor %%ymm6,%%ymm0,%%ymm0 \n"
|
"vpand %%ymm5,%%ymm6,%%ymm1 \n"
|
||||||
|
"vpor %%ymm1,%%ymm0,%%ymm0 \n"
|
||||||
"vmovdqu %%ymm0,0x00(%0,%1,1) \n"
|
"vmovdqu %%ymm0,0x00(%0,%1,1) \n"
|
||||||
"lea 0x20(%0),%0 \n"
|
"lea 0x20(%0),%0 \n"
|
||||||
"sub $0x8,%2 \n"
|
"sub $0x8,%2 \n"
|
||||||
@ -7526,8 +7538,9 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
|
|||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
: "m"(kShuffleAlpha_AVX2) // %3
|
: "m"(kAttenuateShuffle_AVX2) // %3
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||||
|
"xmm7");
|
||||||
}
|
}
|
||||||
#endif // HAS_ARGBATTENUATEROW_AVX2
|
#endif // HAS_ARGBATTENUATEROW_AVX2
|
||||||
|
|
||||||
|
|||||||
@ -1827,19 +1827,27 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct RgbUVConstants {
|
||||||
|
uint8_t kRGBToU[4];
|
||||||
|
uint8_t kRGBToV[4];
|
||||||
|
};
|
||||||
|
|
||||||
// 8x1 pixels.
|
// 8x1 pixels.
|
||||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width) {
|
int width,
|
||||||
|
const struct RgbUVConstants* rgbuvconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vmov.u8 d24, #112 \n" // UB / VR 0.875
|
|
||||||
// coefficient
|
"vld1.8 {d0}, [%4] \n" // load rgbuvconstants
|
||||||
"vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
|
"vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient
|
||||||
"vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
|
"vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient
|
||||||
"vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
|
"vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient
|
||||||
"vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
|
"vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient
|
||||||
|
"vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient
|
||||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||||
@ -1861,11 +1869,49 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
|||||||
"+r"(dst_u), // %1
|
"+r"(dst_u), // %1
|
||||||
"+r"(dst_v), // %2
|
"+r"(dst_v), // %2
|
||||||
"+r"(width) // %3
|
"+r"(width) // %3
|
||||||
:
|
: "r"(rgbuvconstants) // %4
|
||||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
|
||||||
"q15");
|
"q15");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RGB to bt601 coefficients
|
||||||
|
// UB 0.875 coefficient = 112
|
||||||
|
// UG -0.5781 coefficient = 74
|
||||||
|
// UR -0.2969 coefficient = 38
|
||||||
|
// VB -0.1406 coefficient = 18
|
||||||
|
// VG -0.7344 coefficient = 94
|
||||||
|
// VR 0.875 coefficient = 112 (ignored)
|
||||||
|
|
||||||
|
static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
|
||||||
|
{18, 94, 112, 0}};
|
||||||
|
|
||||||
|
// RGB to JPeg coefficients
|
||||||
|
// UB coeff 0.500 = 127
|
||||||
|
// UG coeff -0.33126 = 84
|
||||||
|
// UR coeff -0.16874 = 43
|
||||||
|
// VB coeff -0.08131 = 20
|
||||||
|
// VG coeff -0.41869 = 107
|
||||||
|
// VR coeff 0.500 = 127 (ignored)
|
||||||
|
|
||||||
|
static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
|
||||||
|
{20, 107, 127, 0}};
|
||||||
|
|
||||||
|
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
||||||
|
&kRgb24I601UVConstants);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
||||||
|
&kRgb24JPegUVConstants);
|
||||||
|
}
|
||||||
|
|
||||||
// clang-format off
|
// clang-format off
|
||||||
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
|
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
|
||||||
#define RGBTOUV(QB, QG, QR) \
|
#define RGBTOUV(QB, QG, QR) \
|
||||||
@ -2702,7 +2748,6 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
|||||||
struct RgbConstants {
|
struct RgbConstants {
|
||||||
uint8_t kRGBToY[4];
|
uint8_t kRGBToY[4];
|
||||||
uint16_t kAddY;
|
uint16_t kAddY;
|
||||||
uint16_t pad;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// RGB to JPeg coefficients
|
// RGB to JPeg coefficients
|
||||||
@ -2710,11 +2755,9 @@ struct RgbConstants {
|
|||||||
// G * 0.5870 coefficient = 150
|
// G * 0.5870 coefficient = 150
|
||||||
// R * 0.2990 coefficient = 77
|
// R * 0.2990 coefficient = 77
|
||||||
// Add 0.5 = 0x80
|
// Add 0.5 = 0x80
|
||||||
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
|
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
|
||||||
128,
|
|
||||||
0};
|
|
||||||
|
|
||||||
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
|
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
|
||||||
|
|
||||||
// RGB to BT.601 coefficients
|
// RGB to BT.601 coefficients
|
||||||
// B * 0.1016 coefficient = 25
|
// B * 0.1016 coefficient = 25
|
||||||
@ -2723,12 +2766,9 @@ static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
|
|||||||
// Add 16.5 = 0x1080
|
// Add 16.5 = 0x1080
|
||||||
|
|
||||||
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
|
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
|
||||||
0x1080,
|
0x1080};
|
||||||
0};
|
|
||||||
|
|
||||||
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
|
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
|
||||||
0x1080,
|
|
||||||
0};
|
|
||||||
|
|
||||||
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
|
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
|
||||||
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||||
@ -3058,6 +3098,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
"vmov.u16 q15, #0x00ff \n" // 255 for rounding up
|
||||||
|
|
||||||
// Attenuate 8 pixels.
|
// Attenuate 8 pixels.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
|
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
|
||||||
@ -3065,16 +3107,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
|||||||
"vmull.u8 q10, d0, d3 \n" // b * a
|
"vmull.u8 q10, d0, d3 \n" // b * a
|
||||||
"vmull.u8 q11, d1, d3 \n" // g * a
|
"vmull.u8 q11, d1, d3 \n" // g * a
|
||||||
"vmull.u8 q12, d2, d3 \n" // r * a
|
"vmull.u8 q12, d2, d3 \n" // r * a
|
||||||
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
|
"vaddhn.u16 d0, q10, q15 \n" // (b + 255) >> 8
|
||||||
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
|
"vaddhn.u16 d1, q11, q15 \n" // (g + 255) >> 8
|
||||||
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
|
"vaddhn.u16 d2, q12, q15 \n" // (r + 255) >> 8
|
||||||
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
|
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "q0", "q1", "q10", "q11", "q12");
|
: "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Quantize 8 ARGB pixels (32 bytes).
|
// Quantize 8 ARGB pixels (32 bytes).
|
||||||
|
|||||||
@ -2198,19 +2198,26 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct RgbUVConstants {
|
||||||
|
uint8_t kRGBToU[4];
|
||||||
|
uint8_t kRGBToV[4];
|
||||||
|
};
|
||||||
|
|
||||||
// 8x1 pixels.
|
// 8x1 pixels.
|
||||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width) {
|
int width,
|
||||||
|
const struct RgbUVConstants* rgbuvconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"movi v24.8b, #112 \n" // UB / VR 0.875
|
"ldr d0, [%4] \n" // load rgbuvconstants
|
||||||
// coefficient
|
"dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
|
||||||
"movi v25.8b, #74 \n" // UG -0.5781 coefficient
|
"dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient
|
||||||
"movi v26.8b, #38 \n" // UR -0.2969 coefficient
|
"dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient
|
||||||
"movi v27.8b, #18 \n" // VB -0.1406 coefficient
|
"dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient
|
||||||
"movi v28.8b, #94 \n" // VG -0.7344 coefficient
|
"dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient
|
||||||
"movi v29.16b, #0x80 \n" // 128.5
|
"movi v29.16b, #0x80 \n" // 128.5
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||||
@ -2233,11 +2240,49 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
|||||||
"+r"(dst_u), // %1
|
"+r"(dst_u), // %1
|
||||||
"+r"(dst_v), // %2
|
"+r"(dst_v), // %2
|
||||||
"+r"(width) // %3
|
"+r"(width) // %3
|
||||||
:
|
: "r"(rgbuvconstants) // %4
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
|
||||||
"v27", "v28", "v29");
|
"v27", "v28", "v29");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RGB to bt601 coefficients
|
||||||
|
// UB 0.875 coefficient = 112
|
||||||
|
// UG -0.5781 coefficient = 74
|
||||||
|
// UR -0.2969 coefficient = 38
|
||||||
|
// VB -0.1406 coefficient = 18
|
||||||
|
// VG -0.7344 coefficient = 94
|
||||||
|
// VR 0.875 coefficient = 112 (ignored)
|
||||||
|
|
||||||
|
static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
|
||||||
|
{18, 94, 112, 0}};
|
||||||
|
|
||||||
|
// RGB to JPeg coefficients
|
||||||
|
// UB coeff 0.500 = 127
|
||||||
|
// UG coeff -0.33126 = 84
|
||||||
|
// UR coeff -0.16874 = 43
|
||||||
|
// VB coeff -0.08131 = 20
|
||||||
|
// VG coeff -0.41869 = 107
|
||||||
|
// VR coeff 0.500 = 127 (ignored)
|
||||||
|
|
||||||
|
static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
|
||||||
|
{20, 107, 127, 0}};
|
||||||
|
|
||||||
|
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
||||||
|
&kRgb24I601UVConstants);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
||||||
|
&kRgb24JPegUVConstants);
|
||||||
|
}
|
||||||
|
|
||||||
#define RGBTOUV_SETUP_REG \
|
#define RGBTOUV_SETUP_REG \
|
||||||
"movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
|
"movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
|
||||||
"movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
|
"movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
|
||||||
@ -2943,34 +2988,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
|
|||||||
struct RgbConstants {
|
struct RgbConstants {
|
||||||
uint8_t kRGBToY[4];
|
uint8_t kRGBToY[4];
|
||||||
uint16_t kAddY;
|
uint16_t kAddY;
|
||||||
uint16_t pad;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// RGB to JPeg coefficients
|
|
||||||
// B * 0.1140 coefficient = 29
|
|
||||||
// G * 0.5870 coefficient = 150
|
|
||||||
// R * 0.2990 coefficient = 77
|
|
||||||
// Add 0.5 = 0x80
|
|
||||||
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
|
|
||||||
128,
|
|
||||||
0};
|
|
||||||
|
|
||||||
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
|
|
||||||
|
|
||||||
// RGB to BT.601 coefficients
|
|
||||||
// B * 0.1016 coefficient = 25
|
|
||||||
// G * 0.5078 coefficient = 129
|
|
||||||
// R * 0.2578 coefficient = 66
|
|
||||||
// Add 16.5 = 0x1080
|
|
||||||
|
|
||||||
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
|
|
||||||
0x1080,
|
|
||||||
0};
|
|
||||||
|
|
||||||
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
|
|
||||||
0x1080,
|
|
||||||
0};
|
|
||||||
|
|
||||||
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
|
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
|
||||||
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
@ -3005,6 +3024,26 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
|||||||
"v17");
|
"v17");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RGB to JPeg coefficients
|
||||||
|
// B * 0.1140 coefficient = 29
|
||||||
|
// G * 0.5870 coefficient = 150
|
||||||
|
// R * 0.2990 coefficient = 77
|
||||||
|
// Add 0.5 = 0x80
|
||||||
|
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
|
||||||
|
|
||||||
|
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
|
||||||
|
|
||||||
|
// RGB to BT.601 coefficients
|
||||||
|
// B * 0.1016 coefficient = 25
|
||||||
|
// G * 0.5078 coefficient = 129
|
||||||
|
// R * 0.2578 coefficient = 66
|
||||||
|
// Add 16.5 = 0x1080
|
||||||
|
|
||||||
|
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
|
||||||
|
0x1080};
|
||||||
|
|
||||||
|
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
|
||||||
|
|
||||||
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||||
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
|
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
|
||||||
}
|
}
|
||||||
@ -3402,6 +3441,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
"movi v7.8h, #0x00ff \n" // 255 for rounding up
|
||||||
|
|
||||||
// Attenuate 8 pixels.
|
// Attenuate 8 pixels.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||||
@ -3410,16 +3451,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
|||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"umull v5.8h, v1.8b, v3.8b \n" // g * a
|
"umull v5.8h, v1.8b, v3.8b \n" // g * a
|
||||||
"umull v6.8h, v2.8b, v3.8b \n" // r * a
|
"umull v6.8h, v2.8b, v3.8b \n" // r * a
|
||||||
"uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
|
"addhn v0.8b, v4.8h, v7.8h \n" // (b + 255) >> 8
|
||||||
"uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
|
"addhn v1.8b, v5.8h, v7.8h \n" // (g + 255) >> 8
|
||||||
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
|
"addhn v2.8b, v6.8h, v7.8h \n" // (r + 255) >> 8
|
||||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
|
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Quantize 8 ARGB pixels (32 bytes).
|
// Quantize 8 ARGB pixels (32 bytes).
|
||||||
@ -3980,6 +4021,46 @@ void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
|
|||||||
: "cc", "memory", "v1", "v2", "v3");
|
: "cc", "memory", "v1", "v2", "v3");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Convert FP16 Half Floats to FP32 Floats
|
||||||
|
// Read a column and write a row
|
||||||
|
void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16
|
||||||
|
int src_stride, // stride in elements
|
||||||
|
float* dst,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"cmp %w2, #8 \n" // Is there 8 rows?
|
||||||
|
"b.lo 2f \n"
|
||||||
|
"1: \n"
|
||||||
|
"ld1 {v0.h}[0], [%0], %3 \n" // load 8 halffloats
|
||||||
|
"ld1 {v0.h}[1], [%0], %3 \n"
|
||||||
|
"ld1 {v0.h}[2], [%0], %3 \n"
|
||||||
|
"ld1 {v0.h}[3], [%0], %3 \n"
|
||||||
|
"ld1 {v1.h}[0], [%0], %3 \n"
|
||||||
|
"ld1 {v1.h}[1], [%0], %3 \n"
|
||||||
|
"ld1 {v1.h}[2], [%0], %3 \n"
|
||||||
|
"ld1 {v1.h}[3], [%0], %3 \n"
|
||||||
|
"subs %w2, %w2, #8 \n" // 8 rows per loop
|
||||||
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
|
"fcvtl v2.4s, v0.4h \n" // 4 floats
|
||||||
|
"fcvtl v3.4s, v1.4h \n" // 4 more floats
|
||||||
|
"stp q2, q3, [%1], #32 \n" // store 8 floats
|
||||||
|
"b.gt 1b \n"
|
||||||
|
"cmp %w2, #1 \n" // Is there 1 value?
|
||||||
|
"b.lo 3f \n"
|
||||||
|
"2: \n"
|
||||||
|
"ld1 {v1.h}[0], [%0], %3 \n" // load 1 halffloats
|
||||||
|
"subs %w2, %w2, #1 \n" // 1 floats per loop
|
||||||
|
"fcvtl v2.4s, v1.4h \n" // 1 floats
|
||||||
|
"str s2, [%1], #4 \n" // store 1 floats
|
||||||
|
"b.gt 2b \n"
|
||||||
|
"3: \n"
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
: "r"((ptrdiff_t)(src_stride * 2)) // %3
|
||||||
|
: "cc", "memory", "v0", "v1", "v2", "v3");
|
||||||
|
}
|
||||||
|
|
||||||
// Convert FP32 Floats to FP16 Half Floats
|
// Convert FP32 Floats to FP16 Half Floats
|
||||||
void ConvertFP32ToFP16Row_NEON(const float* src,
|
void ConvertFP32ToFP16Row_NEON(const float* src,
|
||||||
uint16_t* dst, // fp16
|
uint16_t* dst, // fp16
|
||||||
|
|||||||
@ -75,7 +75,6 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
|
|||||||
asm volatile("csrwi vxrm, 0");
|
asm volatile("csrwi vxrm, 0");
|
||||||
do {
|
do {
|
||||||
vuint8m4_t v_odd, v_even, v_dst;
|
vuint8m4_t v_odd, v_even, v_dst;
|
||||||
vuint16m8_t v_sum;
|
|
||||||
vuint32m4_t v_odd_32, v_even_32;
|
vuint32m4_t v_odd_32, v_even_32;
|
||||||
size_t vl = __riscv_vsetvl_e32m4(w);
|
size_t vl = __riscv_vsetvl_e32m4(w);
|
||||||
__riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl);
|
__riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl);
|
||||||
@ -499,7 +498,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
|
|||||||
vuint8m4_t v_u0v0, v_u1v1, v_avg;
|
vuint8m4_t v_u0v0, v_u1v1, v_avg;
|
||||||
vuint16m4_t v_u0v0_16, v_u1v1_16;
|
vuint16m4_t v_u0v0_16, v_u1v1_16;
|
||||||
size_t vl = __riscv_vsetvl_e16m4(w);
|
size_t vl = __riscv_vsetvl_e16m4(w);
|
||||||
vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl);
|
__riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl);
|
||||||
v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16);
|
v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16);
|
||||||
v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16);
|
v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16);
|
||||||
// Use round-to-nearest-up mode for averaging add
|
// Use round-to-nearest-up mode for averaging add
|
||||||
|
|||||||
@ -30,9 +30,9 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(LIBYUV_BIT_EXACT)
|
#if defined(LIBYUV_BIT_EXACT)
|
||||||
#define EXPECTED_ATTENUATE_DIFF 0
|
#define EXPECTED_UNATTENUATE_DIFF 0
|
||||||
#else
|
#else
|
||||||
#define EXPECTED_ATTENUATE_DIFF 2
|
#define EXPECTED_UNATTENUATE_DIFF 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
@ -57,12 +57,17 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
|
|||||||
orig_pixels[2 * 4 + 0] = 16u;
|
orig_pixels[2 * 4 + 0] = 16u;
|
||||||
orig_pixels[2 * 4 + 1] = 64u;
|
orig_pixels[2 * 4 + 1] = 64u;
|
||||||
orig_pixels[2 * 4 + 2] = 192u;
|
orig_pixels[2 * 4 + 2] = 192u;
|
||||||
orig_pixels[2 * 4 + 3] = 255u;
|
orig_pixels[2 * 4 + 3] = 128u;
|
||||||
orig_pixels[3 * 4 + 0] = 16u;
|
orig_pixels[3 * 4 + 0] = 16u;
|
||||||
orig_pixels[3 * 4 + 1] = 64u;
|
orig_pixels[3 * 4 + 1] = 64u;
|
||||||
orig_pixels[3 * 4 + 2] = 192u;
|
orig_pixels[3 * 4 + 2] = 192u;
|
||||||
orig_pixels[3 * 4 + 3] = 128u;
|
orig_pixels[3 * 4 + 3] = 255u;
|
||||||
ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1);
|
orig_pixels[4 * 4 + 0] = 255u;
|
||||||
|
orig_pixels[4 * 4 + 1] = 255u;
|
||||||
|
orig_pixels[4 * 4 + 2] = 255u;
|
||||||
|
orig_pixels[4 * 4 + 3] = 255u;
|
||||||
|
|
||||||
|
ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 5, 1);
|
||||||
EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
|
EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
|
||||||
EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
|
EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
|
||||||
EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
|
EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
|
||||||
@ -71,14 +76,55 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
|
|||||||
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
|
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
|
||||||
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
|
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
|
||||||
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
|
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
|
||||||
EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]);
|
EXPECT_EQ(32u, unatten_pixels[2 * 4 + 0]);
|
||||||
EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]);
|
EXPECT_EQ(128u, unatten_pixels[2 * 4 + 1]);
|
||||||
EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]);
|
EXPECT_EQ(255u, unatten_pixels[2 * 4 + 2]);
|
||||||
EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]);
|
EXPECT_EQ(128u, unatten_pixels[2 * 4 + 3]);
|
||||||
EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]);
|
EXPECT_EQ(16u, unatten_pixels[3 * 4 + 0]);
|
||||||
EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]);
|
EXPECT_EQ(64u, unatten_pixels[3 * 4 + 1]);
|
||||||
EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]);
|
EXPECT_EQ(192u, unatten_pixels[3 * 4 + 2]);
|
||||||
EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]);
|
EXPECT_EQ(255u, unatten_pixels[3 * 4 + 3]);
|
||||||
|
EXPECT_EQ(255u, unatten_pixels[4 * 4 + 0]);
|
||||||
|
EXPECT_EQ(255u, unatten_pixels[4 * 4 + 1]);
|
||||||
|
EXPECT_EQ(255u, unatten_pixels[4 * 4 + 2]);
|
||||||
|
EXPECT_EQ(255u, unatten_pixels[4 * 4 + 3]);
|
||||||
|
|
||||||
|
ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 5, 1);
|
||||||
|
EXPECT_EQ(100u, atten_pixels[0 * 4 + 0]);
|
||||||
|
EXPECT_EQ(65u, atten_pixels[0 * 4 + 1]);
|
||||||
|
EXPECT_EQ(64u, atten_pixels[0 * 4 + 2]);
|
||||||
|
EXPECT_EQ(128u, atten_pixels[0 * 4 + 3]);
|
||||||
|
EXPECT_EQ(0u, atten_pixels[1 * 4 + 0]);
|
||||||
|
EXPECT_EQ(0u, atten_pixels[1 * 4 + 1]);
|
||||||
|
EXPECT_EQ(0u, atten_pixels[1 * 4 + 2]);
|
||||||
|
EXPECT_EQ(0u, atten_pixels[1 * 4 + 3]);
|
||||||
|
EXPECT_EQ(8u, atten_pixels[2 * 4 + 0]);
|
||||||
|
EXPECT_EQ(32u, atten_pixels[2 * 4 + 1]);
|
||||||
|
EXPECT_EQ(96u, atten_pixels[2 * 4 + 2]);
|
||||||
|
EXPECT_EQ(128u, atten_pixels[2 * 4 + 3]);
|
||||||
|
EXPECT_EQ(16u, atten_pixels[3 * 4 + 0]);
|
||||||
|
EXPECT_EQ(64u, atten_pixels[3 * 4 + 1]);
|
||||||
|
EXPECT_EQ(192u, atten_pixels[3 * 4 + 2]);
|
||||||
|
EXPECT_EQ(255u, atten_pixels[3 * 4 + 3]);
|
||||||
|
EXPECT_EQ(255u, atten_pixels[4 * 4 + 0]);
|
||||||
|
EXPECT_EQ(255u, atten_pixels[4 * 4 + 1]);
|
||||||
|
EXPECT_EQ(255u, atten_pixels[4 * 4 + 2]);
|
||||||
|
EXPECT_EQ(255u, atten_pixels[4 * 4 + 3]);
|
||||||
|
|
||||||
|
// test 255
|
||||||
|
for (int i = 0; i < 256; ++i) {
|
||||||
|
orig_pixels[i * 4 + 0] = i;
|
||||||
|
orig_pixels[i * 4 + 1] = 0;
|
||||||
|
orig_pixels[i * 4 + 2] = 0;
|
||||||
|
orig_pixels[i * 4 + 3] = 255;
|
||||||
|
}
|
||||||
|
ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 256, 1);
|
||||||
|
for (int i = 0; i < 256; ++i) {
|
||||||
|
EXPECT_EQ(orig_pixels[i * 4 + 0], atten_pixels[i * 4 + 0]);
|
||||||
|
EXPECT_EQ(0, atten_pixels[i * 4 + 1]);
|
||||||
|
EXPECT_EQ(0, atten_pixels[i * 4 + 2]);
|
||||||
|
EXPECT_EQ(255, atten_pixels[i * 4 + 3]);
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < 1280; ++i) {
|
for (int i = 0; i < 1280; ++i) {
|
||||||
orig_pixels[i * 4 + 0] = i;
|
orig_pixels[i * 4 + 0] = i;
|
||||||
@ -92,10 +138,10 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
|
|||||||
ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
|
ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < 1280; ++i) {
|
for (int i = 0; i < 1280; ++i) {
|
||||||
EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2);
|
EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 1);
|
||||||
EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2);
|
EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 1);
|
||||||
EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2);
|
EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 1);
|
||||||
EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2);
|
EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 1);
|
||||||
}
|
}
|
||||||
// Make sure transparent, 50% and opaque are fully accurate.
|
// Make sure transparent, 50% and opaque are fully accurate.
|
||||||
EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
|
EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
|
||||||
@ -106,9 +152,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
|
|||||||
EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
|
EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
|
||||||
EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
|
EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
|
||||||
EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
|
EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
|
||||||
EXPECT_NEAR(254, atten_pixels[255 * 4 + 0], EXPECTED_ATTENUATE_DIFF);
|
EXPECT_EQ(255, atten_pixels[255 * 4 + 0]);
|
||||||
EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], EXPECTED_ATTENUATE_DIFF);
|
EXPECT_EQ(127, atten_pixels[255 * 4 + 1]);
|
||||||
EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], EXPECTED_ATTENUATE_DIFF);
|
EXPECT_EQ(85, atten_pixels[255 * 4 + 2]);
|
||||||
EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
|
EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
|
||||||
|
|
||||||
free_aligned_buffer_page_end(atten2_pixels);
|
free_aligned_buffer_page_end(atten2_pixels);
|
||||||
@ -165,28 +211,28 @@ TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
|
|||||||
benchmark_iterations_, disable_cpu_flags_,
|
benchmark_iterations_, disable_cpu_flags_,
|
||||||
benchmark_cpu_info_, +1, 0);
|
benchmark_cpu_info_, +1, 0);
|
||||||
|
|
||||||
EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
|
EXPECT_EQ(max_diff, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
|
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
|
||||||
int max_diff =
|
int max_diff =
|
||||||
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
|
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||||
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
|
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
|
||||||
EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
|
EXPECT_EQ(max_diff, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
|
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
|
||||||
int max_diff =
|
int max_diff =
|
||||||
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
|
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||||
disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
|
disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
|
||||||
EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
|
EXPECT_EQ(max_diff, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
|
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
|
||||||
int max_diff =
|
int max_diff =
|
||||||
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
|
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||||
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
|
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
|
||||||
EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
|
EXPECT_EQ(max_diff, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int TestUnattenuateI(int width,
|
static int TestUnattenuateI(int width,
|
||||||
@ -238,28 +284,28 @@ TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
|
|||||||
int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_,
|
int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_,
|
||||||
benchmark_iterations_, disable_cpu_flags_,
|
benchmark_iterations_, disable_cpu_flags_,
|
||||||
benchmark_cpu_info_, +1, 0);
|
benchmark_cpu_info_, +1, 0);
|
||||||
EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
|
EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
|
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
|
||||||
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
||||||
benchmark_iterations_, disable_cpu_flags_,
|
benchmark_iterations_, disable_cpu_flags_,
|
||||||
benchmark_cpu_info_, +1, 1);
|
benchmark_cpu_info_, +1, 1);
|
||||||
EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
|
EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
|
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
|
||||||
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
||||||
benchmark_iterations_, disable_cpu_flags_,
|
benchmark_iterations_, disable_cpu_flags_,
|
||||||
benchmark_cpu_info_, -1, 0);
|
benchmark_cpu_info_, -1, 0);
|
||||||
EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
|
EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
|
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
|
||||||
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
||||||
benchmark_iterations_, disable_cpu_flags_,
|
benchmark_iterations_, disable_cpu_flags_,
|
||||||
benchmark_cpu_info_, +1, 0);
|
benchmark_cpu_info_, +1, 0);
|
||||||
EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
|
EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
|
TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
|
||||||
@ -2764,8 +2810,8 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
|
|||||||
}
|
}
|
||||||
opt_time = (get_time() - opt_time) / benchmark_iterations_;
|
opt_time = (get_time() - opt_time) / benchmark_iterations_;
|
||||||
// Report performance of C vs OPT
|
// Report performance of C vs OPT
|
||||||
printf("%8d us C - %8d us OPT\n",
|
printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
|
||||||
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
|
static_cast<int>(opt_time * 1e6));
|
||||||
for (int i = 0; i < kPixels; ++i) {
|
for (int i = 0; i < kPixels; ++i) {
|
||||||
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
||||||
}
|
}
|
||||||
@ -2804,8 +2850,8 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
|
|||||||
opt_time = (get_time() - opt_time) / benchmark_iterations_;
|
opt_time = (get_time() - opt_time) / benchmark_iterations_;
|
||||||
|
|
||||||
// Report performance of C vs OPT
|
// Report performance of C vs OPT
|
||||||
printf("%8d us C - %8d us OPT\n",
|
printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
|
||||||
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
|
static_cast<int>(opt_time * 1e6));
|
||||||
for (int i = 0; i < kPixels * 4; ++i) {
|
for (int i = 0; i < kPixels * 4; ++i) {
|
||||||
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
||||||
}
|
}
|
||||||
@ -4531,6 +4577,43 @@ TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) {
|
|||||||
free_aligned_buffer_page_end(rec_opt);
|
free_aligned_buffer_page_end(rec_opt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Round trip FP32 -> FP16 -> FP32 -> FP16, using the column reader with a
// stride of 1 element for the FP16 -> FP32 step, and expect the second FP16
// buffer to match the first one exactly.
TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32Column) {
  const int y_plane_size = benchmark_width_ * benchmark_height_;

  // Buffers are sized in bytes: 4 per float element, 2 per half element.
  align_buffer_page_end(orig_f, y_plane_size * 4);
  align_buffer_page_end(orig_y, y_plane_size * 2);
  align_buffer_page_end(dst_opt, y_plane_size * 4);
  align_buffer_page_end(rec_opt, y_plane_size * 2);

  float* const source_floats = (float*)orig_f;
  for (int n = 0; n < y_plane_size; ++n) {
    source_floats[n] = (float)(n % 10000) * 3.14f;
  }
  // Distinct fill patterns so an unwritten output cannot match by accident.
  memset(orig_y, 1, y_plane_size * 2);
  memset(dst_opt, 2, y_plane_size * 4);
  memset(rec_opt, 3, y_plane_size * 2);

  // Reference half floats.
  ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
                            y_plane_size);

  // Function under test, repeated for benchmarking.
  int iteration = 0;
  while (iteration < benchmark_iterations_) {
    ConvertFP16ToFP32Column_NEON((const uint16_t*)orig_y, 1, (float*)dst_opt,
                                 y_plane_size);
    iteration++;
  }

  // Convert back to half floats for a bit-exact comparison.
  ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
                            y_plane_size);

  const uint16_t* const expected_halfs = (const uint16_t*)orig_y;
  const uint16_t* const actual_halfs = (const uint16_t*)rec_opt;
  for (int n = 0; n < y_plane_size; ++n) {
    EXPECT_EQ(expected_halfs[n], actual_halfs[n]);
  }

  free_aligned_buffer_page_end(orig_f);
  free_aligned_buffer_page_end(orig_y);
  free_aligned_buffer_page_end(dst_opt);
  free_aligned_buffer_page_end(rec_opt);
}
|
||||||
|
|
||||||
#endif // defined(ENABLE_ROW_TESTS) && defined(__aarch64__)
|
#endif // defined(ENABLE_ROW_TESTS) && defined(__aarch64__)
|
||||||
|
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user