mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 09:47:45 +08:00
Shade ported to Neon
BUG=167 TESTED=planar test Shade* Review URL: https://webrtc-codereview.appspot.com/969014 git-svn-id: http://libyuv.googlecode.com/svn/trunk@509 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
ef60ab0db4
commit
b94b139e86
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 508
|
||||
Version: 509
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -223,6 +223,7 @@ extern "C" {
|
||||
#define HAS_ARGBBLENDROW_NEON
|
||||
#define HAS_ARGBATTENUATEROW_NEON
|
||||
#define HAS_ARGBQUANTIZEROW_NEON
|
||||
#define HAS_ARGBSHADEROW_NEON
|
||||
#endif
|
||||
|
||||
// The following are available on Mips platforms
|
||||
@ -1250,6 +1251,8 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value);
|
||||
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value);
|
||||
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value);
|
||||
|
||||
LIBYUV_API
|
||||
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 508
|
||||
#define LIBYUV_VERSION 509
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -1133,6 +1133,10 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
|
||||
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
|
||||
ARGBShadeRow = ARGBShadeRow_SSE2;
|
||||
}
|
||||
#elif defined(HAS_ARGBSHADEROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
|
||||
ARGBShadeRow = ARGBShadeRow_NEON;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int y = 0; y < height; ++y) {
|
||||
|
||||
@ -665,6 +665,32 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
|
||||
}
|
||||
}
|
||||
|
||||
#define REPEAT8(v) (v) | ((v) << 8)
|
||||
#define SHADE(f, v) v * f >> 24
|
||||
|
||||
void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
const uint32 b_scale = REPEAT8(value & 0xff);
|
||||
const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
|
||||
const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
|
||||
const uint32 a_scale = REPEAT8(value >> 24);
|
||||
|
||||
for (int i = 0; i < width; ++i) {
|
||||
const uint32 b = REPEAT8(src_argb[0]);
|
||||
const uint32 g = REPEAT8(src_argb[1]);
|
||||
const uint32 r = REPEAT8(src_argb[2]);
|
||||
const uint32 a = REPEAT8(src_argb[3]);
|
||||
dst_argb[0] = SHADE(b, b_scale);
|
||||
dst_argb[1] = SHADE(g, g_scale);
|
||||
dst_argb[2] = SHADE(r, r_scale);
|
||||
dst_argb[3] = SHADE(a, a_scale);
|
||||
src_argb += 4;
|
||||
dst_argb += 4;
|
||||
}
|
||||
}
|
||||
#undef REPEAT8
|
||||
#undef SHADE
|
||||
|
||||
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
|
||||
// Copy a Y to RGB.
|
||||
for (int x = 0; x < width; ++x) {
|
||||
@ -1512,32 +1538,6 @@ void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
|
||||
}
|
||||
}
|
||||
|
||||
#define REPEAT8(v) (v) | ((v) << 8)
|
||||
#define SHADE(f, v) v * f >> 24
|
||||
|
||||
void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
const uint32 b_scale = REPEAT8(value & 0xff);
|
||||
const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
|
||||
const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
|
||||
const uint32 a_scale = REPEAT8(value >> 24);
|
||||
|
||||
for (int i = 0; i < width; ++i) {
|
||||
const uint32 b = REPEAT8(src_argb[0]);
|
||||
const uint32 g = REPEAT8(src_argb[1]);
|
||||
const uint32 r = REPEAT8(src_argb[2]);
|
||||
const uint32 a = REPEAT8(src_argb[3]);
|
||||
dst_argb[0] = SHADE(b, b_scale);
|
||||
dst_argb[1] = SHADE(g, g_scale);
|
||||
dst_argb[2] = SHADE(r, r_scale);
|
||||
dst_argb[3] = SHADE(a, a_scale);
|
||||
src_argb += 4;
|
||||
dst_argb += 4;
|
||||
}
|
||||
}
|
||||
#undef REPEAT8
|
||||
#undef SHADE
|
||||
|
||||
// Copy pixels from rotated source to destination row with a slope.
|
||||
LIBYUV_API
|
||||
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
|
||||
|
||||
@ -2510,7 +2510,43 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
|
||||
: "r"(scale), // %2
|
||||
"r"(interval_size), // %3
|
||||
"r"(interval_offset) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
|
||||
);
|
||||
}
|
||||
|
||||
// Shade 8 pixels at a time by specified value.
|
||||
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
|
||||
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
asm volatile (
|
||||
"vdup.u32 q0, %3 \n" // duplicate scale value.
|
||||
"vtrn.u8 d0, d1 \n" // d0 rrbb, d1 aagg
|
||||
"vshr.u16 q0, q0, #1 \n" // scale >>= 1
|
||||
|
||||
// 8 pixel loop.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmovl.u8 q10, d20 \n" // b (0 .. 255)
|
||||
"vmovl.u8 q11, d22 \n"
|
||||
"vmovl.u8 q12, d24 \n"
|
||||
"vmovl.u8 q13, d26 \n"
|
||||
"vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale
|
||||
"vqrdmulh.s16 q11, q11, d1[0] \n" // g
|
||||
"vqrdmulh.s16 q12, q12, d0[1] \n" // r
|
||||
"vqrdmulh.s16 q13, q13, d1[0] \n" // a
|
||||
"vqmovn.u16 d20, q10 \n"
|
||||
"vqmovn.u16 d22, q11 \n"
|
||||
"vqmovn.u16 d24, q12 \n"
|
||||
"vqmovn.u16 d26, q13 \n"
|
||||
"vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(value) // %3
|
||||
: "cc", "memory", "q0", "q10", "q11", "q12", "q13"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@ -3921,6 +3921,45 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
||||
}
|
||||
#endif // HAS_ARGBQUANTIZEROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBSHADEROW_SSE2
|
||||
// Shade 4 pixels at a time by specified value.
// Every byte of each pixel is scaled by the matching byte of value:
// result = ((byte * 0x0101) * (scale * 0x0101)) >> 24.
|
||||
// Aligned to 16 bytes: src_argb and dst_argb are accessed with movdqa.
|
||||
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
asm volatile (
|
||||
"movd %3,%%xmm2 \n"  // xmm2 low 32 bits = value.
|
||||
"sub %0,%1 \n"  // dst becomes an offset from src.
|
||||
"punpcklbw %%xmm2,%%xmm2 \n"  // each scale byte doubled into a 16 bit word.
|
||||
"punpcklqdq %%xmm2,%%xmm2 \n"  // copy the 4 scale words to the upper half.
|
||||
|
|
||||
// 4 pixel loop.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"  // read 4 pixels.
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklbw %%xmm0,%%xmm0 \n"  // first 2 pixels, bytes doubled to words.
|
||||
"punpckhbw %%xmm1,%%xmm1 \n"  // next 2 pixels, bytes doubled to words.
|
||||
"pmulhuw %%xmm2,%%xmm0 \n"  // high 16 bits of argb * value.
|
||||
"pmulhuw %%xmm2,%%xmm1 \n"  // high 16 bits of argb * value.
|
||||
"psrlw $0x8,%%xmm0 \n"  // keep the top 8 bits of each word.
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"  // pack 4 pixels back to bytes.
|
||||
"sub $0x4,%2 \n"  // 4 pixels per loop; sets flags for jg.
|
||||
"movdqa %%xmm0,(%0,%1,1) \n"  // store to dst (src + offset).
|
||||
"lea 0x10(%0),%0 \n"  // advance src; lea leaves flags intact.
|
||||
"jg 1b \n"  // loop while width remains > 0.
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(value) // %3
|
||||
: "memory", "cc"
|
||||
#if defined(__SSE2__)
|
||||
, "xmm0", "xmm1", "xmm2"
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBSHADEROW_SSE2
|
||||
|
||||
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
|
||||
// Creates a table of cumulative sums where each value is a sum of all values
|
||||
// above and to the left of the value, inclusive of the value.
|
||||
@ -4091,44 +4130,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
|
||||
);
|
||||
}
|
||||
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
|
||||
#ifdef HAS_ARGBSHADEROW_SSE2
|
||||
// Shade 4 pixels at a time by specified value.
// Every byte of each pixel is scaled by the matching byte of value:
// result = ((byte * 0x0101) * (scale * 0x0101)) >> 24.
|
||||
// Aligned to 16 bytes: src_argb and dst_argb are accessed with movdqa.
|
||||
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
asm volatile (
|
||||
"movd %3,%%xmm2 \n"  // xmm2 low 32 bits = value.
|
||||
"sub %0,%1 \n"  // dst becomes an offset from src.
|
||||
"punpcklbw %%xmm2,%%xmm2 \n"  // each scale byte doubled into a 16 bit word.
|
||||
"punpcklqdq %%xmm2,%%xmm2 \n"  // copy the 4 scale words to the upper half.
|
||||
|
|
||||
// 4 pixel loop.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"  // read 4 pixels.
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklbw %%xmm0,%%xmm0 \n"  // first 2 pixels, bytes doubled to words.
|
||||
"punpckhbw %%xmm1,%%xmm1 \n"  // next 2 pixels, bytes doubled to words.
|
||||
"pmulhuw %%xmm2,%%xmm0 \n"  // high 16 bits of argb * value.
|
||||
"pmulhuw %%xmm2,%%xmm1 \n"  // high 16 bits of argb * value.
|
||||
"psrlw $0x8,%%xmm0 \n"  // keep the top 8 bits of each word.
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"  // pack 4 pixels back to bytes.
|
||||
"sub $0x4,%2 \n"  // 4 pixels per loop; sets flags for jg.
|
||||
"movdqa %%xmm0,(%0,%1,1) \n"  // store to dst (src + offset).
|
||||
"lea 0x10(%0),%0 \n"  // advance src; lea leaves flags intact.
|
||||
"jg 1b \n"  // loop while width remains > 0.
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(value) // %3
|
||||
: "memory", "cc"
|
||||
#if defined(__SSE2__)
|
||||
, "xmm0", "xmm1", "xmm2"
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBSHADEROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBAFFINEROW_SSE2
|
||||
// TODO(fbarchard): Find 64 bit way to avoid masking.
|
||||
|
||||
@ -4124,6 +4124,42 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
||||
}
|
||||
#endif // HAS_ARGBQUANTIZEROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBSHADEROW_SSE2
|
||||
// Shade 4 pixels at a time by specified value.
// Every byte of each pixel is scaled by the matching byte of value:
// result = ((byte * 0x0101) * (scale * 0x0101)) >> 24.
|
||||
// Aligned to 16 bytes: src_argb and dst_argb are accessed with movdqa.
|
||||
// Naked function: arguments are read from the stack relative to esp and the
// body returns with an explicit ret.
__declspec(naked) __declspec(align(16))
|
||||
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_argb
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // width
|
||||
movd xmm2, [esp + 16] // value
|
||||
sub edx, eax // dst becomes an offset from src
|
||||
punpcklbw xmm2, xmm2 // each scale byte doubled into a 16 bit word
|
||||
punpcklqdq xmm2, xmm2 // copy the 4 scale words to the upper half
|
||||
|
|
||||
align 16
|
||||
convertloop:
|
||||
movdqa xmm0, [eax] // read 4 pixels
|
||||
movdqa xmm1, xmm0
|
||||
punpcklbw xmm0, xmm0 // first 2
|
||||
punpckhbw xmm1, xmm1 // next 2
|
||||
pmulhuw xmm0, xmm2 // argb * value
|
||||
pmulhuw xmm1, xmm2 // argb * value
|
||||
psrlw xmm0, 8 // keep the top 8 bits of each word
|
||||
psrlw xmm1, 8
|
||||
packuswb xmm0, xmm1 // pack 4 pixels back to bytes
|
||||
sub ecx, 4 // 4 pixels per loop; sets flags for jg
|
||||
movdqa [eax + edx], xmm0 // store to dst (src + offset)
|
||||
lea eax, [eax + 16] // advance src; lea leaves flags intact
|
||||
jg convertloop // loop while width remains > 0
|
||||
|
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBSHADEROW_SSE2
|
||||
|
||||
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
|
||||
// Consider float CumulativeSum.
|
||||
// Consider calling CumulativeSum one row at time as needed.
|
||||
@ -4315,42 +4351,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
|
||||
}
|
||||
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBSHADEROW_SSE2
|
||||
// Shade 4 pixels at a time by specified value.
// Every byte of each pixel is scaled by the matching byte of value:
// result = ((byte * 0x0101) * (scale * 0x0101)) >> 24.
|
||||
// Aligned to 16 bytes: src_argb and dst_argb are accessed with movdqa.
|
||||
// Naked function: arguments are read from the stack relative to esp and the
// body returns with an explicit ret.
__declspec(naked) __declspec(align(16))
|
||||
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_argb
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // width
|
||||
movd xmm2, [esp + 16] // value
|
||||
sub edx, eax // dst becomes an offset from src
|
||||
punpcklbw xmm2, xmm2 // each scale byte doubled into a 16 bit word
|
||||
punpcklqdq xmm2, xmm2 // copy the 4 scale words to the upper half
|
||||
|
|
||||
align 16
|
||||
convertloop:
|
||||
movdqa xmm0, [eax] // read 4 pixels
|
||||
movdqa xmm1, xmm0
|
||||
punpcklbw xmm0, xmm0 // first 2
|
||||
punpckhbw xmm1, xmm1 // next 2
|
||||
pmulhuw xmm0, xmm2 // argb * value
|
||||
pmulhuw xmm1, xmm2 // argb * value
|
||||
psrlw xmm0, 8 // keep the top 8 bits of each word
|
||||
psrlw xmm1, 8
|
||||
packuswb xmm0, xmm1 // pack 4 pixels back to bytes
|
||||
sub ecx, 4 // 4 pixels per loop; sets flags for jg
|
||||
movdqa [eax + edx], xmm0 // store to dst (src + offset)
|
||||
lea eax, [eax + 16] // advance src; lea leaves flags intact
|
||||
jg convertloop // loop while width remains > 0
|
||||
|
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBSHADEROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBAFFINEROW_SSE2
|
||||
// Copy ARGB pixels from source image with slope to a row of destination.
|
||||
__declspec(naked) __declspec(align(16))
|
||||
|
||||
@ -539,7 +539,8 @@ TEST_F(libyuvTest, TestShade) {
|
||||
orig_pixels[3][1] = 0u;
|
||||
orig_pixels[3][2] = 0u;
|
||||
orig_pixels[3][3] = 0u;
|
||||
ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80ffffff);
|
||||
// Do 8 pixels to allow opt version to be used.
|
||||
ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80ffffff);
|
||||
EXPECT_EQ(10u, shade_pixels[0][0]);
|
||||
EXPECT_EQ(20u, shade_pixels[0][1]);
|
||||
EXPECT_EQ(40u, shade_pixels[0][2]);
|
||||
@ -557,12 +558,18 @@ TEST_F(libyuvTest, TestShade) {
|
||||
EXPECT_EQ(0u, shade_pixels[3][2]);
|
||||
EXPECT_EQ(0u, shade_pixels[3][3]);
|
||||
|
||||
ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80808080);
|
||||
ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80808080);
|
||||
EXPECT_EQ(5u, shade_pixels[0][0]);
|
||||
EXPECT_EQ(10u, shade_pixels[0][1]);
|
||||
EXPECT_EQ(20u, shade_pixels[0][2]);
|
||||
EXPECT_EQ(40u, shade_pixels[0][3]);
|
||||
|
||||
ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x10204080);
|
||||
EXPECT_EQ(5u, shade_pixels[0][0]);
|
||||
EXPECT_EQ(5u, shade_pixels[0][1]);
|
||||
EXPECT_EQ(5u, shade_pixels[0][2]);
|
||||
EXPECT_EQ(5u, shade_pixels[0][3]);
|
||||
|
||||
for (int i = 0; i < benchmark_pixels_div256_; ++i) {
|
||||
ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1,
|
||||
0x80808080);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user