Attenuate ARGB pixels NEON optimized

BUG=164
TEST=./libyuv_unittest --gtest_filter=*Atten*
Review URL: https://webrtc-codereview.appspot.com/937031

git-svn-id: http://libyuv.googlecode.com/svn/trunk@506 16f28f9a-4ce2-e073-06de-1de4eb20be90
Author: fbarchard@google.com
Date: 2012-11-28 20:02:55 +0000
Commit: 1d160cb99f (parent: 326a521aba)
18 changed files with 280 additions and 123 deletions

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 505
+Version: 506
 License: BSD
 License File: LICENSE

@@ -53,7 +53,7 @@ int ArmCpuCaps(const char* cpuinfo_name);
 // returns non-zero if instruction set is detected
 static __inline int TestCpuFlag(int test_flag) {
   LIBYUV_API extern int cpu_info_;
-  return (cpu_info_ == 1 ? InitCpuFlags() : cpu_info_) & test_flag;
+  return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
 }
 // For testing, allow CPU flags to be disabled.

@@ -139,7 +139,7 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBATTENUATE_SSE2
+#define HAS_ARGBATTENUATEROW_SSE2
 #define HAS_ARGBBLENDROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #endif

@@ -221,6 +221,7 @@ extern "C" {
 // Effects
 #define HAS_ARGBINTERPOLATEROW_NEON
 #define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
 #endif
 // The following are available on Mips platforms

@@ -935,6 +936,12 @@ void YToARGBRow_SSE2(const uint8* src_y,
 void YToARGBRow_NEON(const uint8* src_y,
                      uint8* dst_argb,
                      int width);
+void YToARGBRow_Any_SSE2(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
+void YToARGBRow_Any_NEON(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
 // ARGB preattenuated alpha blend.
 void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,

@@ -1194,6 +1201,13 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y,
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                               int width);
 // Inverse table for unattenuate, shared by C and SSE2.
 extern uint32 fixed_invtbl8[256];

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 505
+#define LIBYUV_VERSION 506
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT

@@ -18,6 +18,7 @@
 #endif
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
 #include "libyuv/row.h"

@@ -215,12 +216,7 @@ int I444ToI420(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr);
+// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler.
 // 411 chroma is 1/4 width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API

@@ -256,19 +252,15 @@ int I411ToI420(const uint8* src_y, int src_stride_y,
   int halfheight = (height + 1) >> 1;
   int quarterwidth = (width + 3) >> 2;
-  // Resample U plane.
-  ScalePlaneBilinear(quarterwidth, height,  // from 1/4 width, 1x height
-                     halfwidth, halfheight,  // to 1/2 width, 1/2 height
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Resample U plane from 1/4 width, 1x height to 1/2 width, 1/2 height.
+  ScalePlane(src_u, src_stride_u, quarterwidth, height,
+             dst_u, dst_stride_u, halfwidth, halfheight,
+             kFilterNone);
   // Resample V plane.
-  ScalePlaneBilinear(quarterwidth, height,  // from 1/4 width, 1x height
-                     halfwidth, halfheight,  // to 1/2 width, 1/2 height
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, quarterwidth, height,
+             dst_v, dst_stride_v, halfwidth, halfheight,
+             kFilterNone);
   return 0;
 }

@@ -1738,7 +1730,6 @@ static void JpegI400ToI420(void* opaque,
 LIBYUV_API
 int MJPGSize(const uint8* sample, size_t sample_size,
              int* width, int* height) {
-  // TODO(fbarchard): Port to C
   MJpegDecoder mjpeg_decoder;
   bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret) {

@@ -1764,7 +1755,7 @@ int MJPGToI420(const uint8* sample,
     return -1;
   }
-  // TODO(fbarchard): Port to C
+  // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret && (mjpeg_decoder.GetWidth() != w ||

@@ -230,14 +230,20 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                       uint8* rgb_buf,
                       int width) = YToARGBRow_C;
 #if defined(HAS_YTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) &&
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    YToARGBRow = YToARGBRow_SSE2;
+    YToARGBRow = YToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_SSE2;
+    }
   }
 #elif defined(HAS_YTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    YToARGBRow = YToARGBRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YToARGBRow = YToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_NEON;
+    }
   }
 #endif
   for (int y = 0; y < height; ++y) {
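The hunk above is the selection idiom this commit applies throughout: any width at or above the SIMD granularity picks the _Any_ wrapper (SIMD body plus C remainder), and an exact multiple upgrades to the pure SIMD row. As a sketch with descriptive comments (same shape as the hunk above, shown for the NEON branch):

    #if defined(HAS_YTOARGBROW_NEON)
      // Two-level selection: the _Any_ row handles any width >= 8 by running
      // NEON on the multiple-of-8 prefix and C on the leftover pixels; an
      // exact multiple of 8 skips the wrapper entirely.
      if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
        YToARGBRow = YToARGBRow_Any_NEON;   // SIMD body + C remainder.
        if (IS_ALIGNED(width, 8)) {
          YToARGBRow = YToARGBRow_NEON;     // pure SIMD, no remainder.
        }
      }
    #endif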
@@ -941,7 +947,7 @@ int MJPGToARGB(const uint8* sample,
     return -1;
   }
-  // TODO(fbarchard): Port to C
+  // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret && (mjpeg_decoder.GetWidth() != w ||

@@ -16,6 +16,7 @@
 #include "libyuv/format_conversion.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
 #include "libyuv/row.h"

@@ -98,12 +99,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr);
+// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler.
 LIBYUV_API
 int I420ToI444(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,

@@ -136,19 +132,15 @@ int I420ToI444(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  // Upsample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Upsample U plane from 1/2 width, 1/2 height to 1x width, 1x height.
+  ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+             dst_u, dst_stride_u, width, height,
+             kFilterNone);
   // Upsample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+             dst_v, dst_stride_v, width, height,
+             kFilterNone);
   return 0;
 }

@@ -187,19 +179,15 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
   int halfheight = (height + 1) >> 1;
   int quarterwidth = (width + 3) >> 2;
-  // Resample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Resample U plane from 1/2 width, 1/2 height to 1/4 width, 1x height.
+  ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+             dst_u, dst_stride_u, quarterwidth, height,
+             kFilterNone);
   // Resample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+             dst_v, dst_stride_v, quarterwidth, height,
+             kFilterNone);
   return 0;
 }

@@ -360,7 +348,6 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// TODO(fbarchard): Deprecate, move or expand 422 support?
 LIBYUV_API
 int I422ToUYVY(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,

@@ -138,9 +138,8 @@ static int MipsCpuCaps(const char* search_string) {
 #endif
 // CPU detect function for SIMD instruction sets.
-// TODO(fbarchard): Use constant if/when valgrind says cpu_info is initialized.
 LIBYUV_API
-int cpu_info_ = 1;  // 1 means cpu info is not initialized yet.
+int cpu_info_ = kCpuInit;  // cpu_info is not initialized yet.
 // Test environment variable for disabling CPU features. Any non-zero value
 // to disable. Zero ignored to make it easy to set the variable on/off.

@@ -767,19 +767,33 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
   }
   void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
                            int width) = ARGBAttenuateRow_C;
-#if defined(HAS_ARGBATTENUATE_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
   }
 #endif
   for (int y = 0; y < height; ++y) {

@@ -1126,9 +1140,8 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
 }
 // Interpolate 2 ARGB images by specified amount (0 to 255).
-// TODO(fbarchard): Check width is multiple of 16. Do Any version.
-// TODO(fbarchard): Consider selecting a specialized interpolator so
-// interpolation doesn't need to be checked on each row.
+// TODO(fbarchard): Consider selecting a specialization for interpolation so
+// row function doesn't need to check interpolation on each row.
 LIBYUV_API
 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                     const uint8* src_argb1, int src_stride_argb1,

@@ -1147,15 +1160,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) = ARGBInterpolateRow_C;
 #if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
       IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
     ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
   }
 #elif defined(HAS_ARGBINTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
     ARGBInterpolateRow = ARGBInterpolateRow_NEON;
   }
 #endif

@@ -113,8 +113,8 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
     "vtbl.8 d0, {d2, d3}, d6 \n"
     "vtbl.8 d1, {d2, d3}, d7 \n"
-    // TODO: rework shuffle above to write
-    // out with 4 instead of 8 writes
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
     "vst1.32 {d4[0]}, [r9], %3 \n"
     "vst1.32 {d4[1]}, [r9], %3 \n"
     "vst1.32 {d5[0]}, [r9], %3 \n"

@@ -276,7 +276,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
     "cmp %6, #4 \n"
     "blt 2f \n"
-    //TODO(frkoenig) : clean this up
+    // TODO(frkoenig): Clean this up.
     // 4x8 block
     "mov r9, %0 \n"
     "vld1.64 {d0}, [r9], %1 \n"

@@ -141,6 +141,8 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
        3, 4, 2)
 RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
        7, 1, 4)
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
+       7, 1, 4)
 RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
        15, 2, 4)
 RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,

@@ -157,6 +159,8 @@ RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
        7, 4, 2)
 RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
        7, 1, 4)
+RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
+       7, 1, 4)
 RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
        7, 2, 4)
 RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,

@@ -226,6 +230,28 @@ YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
 #endif
 #undef YANY
+// Attenuate is destructive so last16 method can not be used due to overlap.
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
+    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
+      int n = width & ~MASK; \
+      ARGBTOY_SIMD(src_argb, dst_y, n); \
+      ARGBTOY_C(src_argb + n * SBPP, \
+                dst_y + n * BPP, width & MASK); \
+    }
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
+     4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
+     4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
+     4, 4, 7)
+#endif
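For reference, the NEON instantiation above expands mechanically to roughly the following (MASK = 7 keeps the multiple-of-8 prefix for NEON and leaves at most 7 pixels for C):

    void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_y,
                                   int width) {
      int n = width & ~7;                   // largest multiple of 8 <= width.
      ARGBAttenuateRow_NEON(src_argb, dst_y, n);
      ARGBAttenuateRow_C(src_argb + n * 4,  // remaining 0-7 pixels in C,
                         dst_y + n * 4,     // 4 bytes per ARGB pixel.
                         width & 7);
    }

Because attenuation can run in place (src == dst), the "redo the last 16 pixels" trick used elsewhere in this file would double-attenuate the overlapping pixels; hence the separate C remainder noted in the comment above.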
 // RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
 #define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \
     void NAMEANY(const uint8* src_argb, int src_stride_argb, \

@@ -2418,6 +2418,61 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
   );
 }
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+    "1: \n"
+    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of ARGB.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
+    "vmull.u8 q10, d0, d3 \n"  // b * a
+    "vmull.u8 q11, d1, d3 \n"  // g * a
+    "vmull.u8 q12, d2, d3 \n"  // r * a
+    "vqrshrn.u16 d0, q10, #8 \n"  // b >>= 8
+    "vqrshrn.u16 d1, q11, #8 \n"  // g >>= 8
+    "vqrshrn.u16 d2, q12, #8 \n"  // r >>= 8
+    "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
+    "bgt 1b \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+  );
+}
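As a reading aid, here is the per-pixel math the loop above implements, in scalar form (a sketch, not code from the commit; the helper name is hypothetical). vqrshrn.u16 is a rounding narrowing shift, so the NEON path computes a rounded product, while the C reference row rounds slightly differently; that is why the updated tests below tolerate a difference of 1-2.

    // Scalar equivalent of one attenuated pixel in the NEON loop (sketch).
    static void AttenuatePixelNEONStyle(const unsigned char* src,
                                        unsigned char* dst) {
      int a = src[3];
      dst[0] = (src[0] * a + 128) >> 8;  // b: vmull.u8 then vqrshrn (rounds).
      dst[1] = (src[1] * a + 128) >> 8;  // g
      dst[2] = (src[2] * a + 128) >> 8;  // r
      dst[3] = a;                        // d3 is stored back unchanged.
    }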
+#ifdef ARGBATTENUATEROW_VQRDMULH
+// TODO(fbarchard): Remove this. Works but is slower and off by 2.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+    "1: \n"
+    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 pixels of ARGB.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
+    "vmovl.u8 q0, d0 \n"
+    "vmovl.u8 q1, d2 \n"
+    "vmovl.u8 q2, d4 \n"
+    "vmovl.u8 q8, d6 \n"
+    "vshl.u16 q0, q0, #7 \n"  // b << 7
+    "vshl.u16 q1, q1, #7 \n"  // g << 7
+    "vshl.u16 q2, q2, #7 \n"  // r << 7
+    "vqrdmulh.s16 q0, q0, q8 \n"  // b * a
+    "vqrdmulh.s16 q1, q1, q8 \n"  // g * a
+    "vqrdmulh.s16 q2, q2, q8 \n"  // r * a
+    "vmovn.u16 d0, q0 \n"
+    "vmovn.u16 d2, q1 \n"
+    "vmovn.u16 d4, q2 \n"
+    "vst4.8 {d0, d2, d4, d6}, [%1]! \n"  // store 8 pixels of ARGB.
+    "bgt 1b \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8"
+  );
+}
+#endif
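On the ifdef'd alternative above: vqrdmulh.s16 is the saturating rounding doubling multiply, whose architectural definition for 16-bit lanes is result = sat16((2 * a * b + 0x8000) >> 16). Pre-shifting each color channel left by 7 lines it up with the same fixed point as the vqrshrn version:

    // With c in 0..255 shifted left 7 (still fits in s16) and alpha a in 0..255:
    //   (2 * (c << 7) * a + 0x8000) >> 16
    // = (c * a * 256 + 32768) >> 16
    // = (c * a + 128) >> 8

In exact arithmetic that matches the primary loop, so the "off by 2" in the TODO is presumably measured against the C reference; either way the variant is slower and kept only for reference.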
 #endif  // __ARM_NEON__
 #ifdef __cplusplus

@@ -3519,7 +3519,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
-#ifdef HAS_ARGBATTENUATE_SSE2
+#ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
 // aligned to 16 bytes
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {

@@ -3564,7 +3564,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
 #endif
   );
 }
-#endif  // HAS_ARGBATTENUATE_SSE2
+#endif  // HAS_ARGBATTENUATEROW_SSE2
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha

@@ -4132,7 +4132,6 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // TODO(fbarchard): Find 64 bit way to avoid masking.
-// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
 // Copy ARGB pixels from source image with slope to a row of destination.
 // Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
 // an error if movq is used. movd %%xmm0,%1

@@ -1675,7 +1675,6 @@ static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
 // Read 8 UV from 411.

@@ -3701,7 +3700,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
-#ifdef HAS_ARGBATTENUATE_SSE2
+#ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
 // Aligned to 16 bytes.
 __declspec(naked) __declspec(align(16))

@@ -3743,7 +3742,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     ret
   }
 }
-#endif  // HAS_ARGBATTENUATE_SSE2
+#endif  // HAS_ARGBATTENUATEROW_SSE2
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.

@@ -3091,18 +3091,18 @@ void ScalePlaneBilinear(int src_width, int src_height,
                           int dst_width, int source_y_fraction) =
       ScaleFilterRows_C;
 #if defined(HAS_SCALEFILTERROWS_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) {
     ScaleFilterRows = ScaleFilterRows_NEON;
   }
 #endif
 #if defined(HAS_SCALEFILTERROWS_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16) &&
       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
     ScaleFilterRows = ScaleFilterRows_SSE2;
   }
 #endif
 #if defined(HAS_SCALEFILTERROWS_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 16)) {
     ScaleFilterRows = ScaleFilterRows_Unaligned_SSSE3;
     if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
       ScaleFilterRows = ScaleFilterRows_SSSE3;

@@ -3110,7 +3110,7 @@ void ScalePlaneBilinear(int src_width, int src_height,
   }
 #endif
 #if defined(HAS_SCALEFILTERROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4)) {
     ScaleFilterRows = ScaleFilterRows_MIPS_DSPR2;
   }

@@ -3129,7 +3129,6 @@ void ScalePlaneBilinear(int src_width, int src_height,
     int yf = (y >> 8) & 255;
     const uint8* src = src_ptr + yi * src_stride;
     ScaleFilterRows(row, src, src_stride, src_width, yf);
-    row[src_width] = row[src_width - 1];
     ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
     dst_ptr += dst_stride;
     y += dy;

@@ -856,8 +856,7 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb,
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
   const uint8* src_ptr1 = src_argb + src_stride;
-  uint8* end = dst_argb + (dst_width << 2);
-  do {
+  for (int x = 0; x < dst_width - 1; x += 2) {
     dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
     dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
     dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;

@@ -869,7 +868,14 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb,
     src_argb += 8;
     src_ptr1 += 8;
     dst_argb += 8;
-  } while (dst_argb < end);
+  }
+  if (dst_width & 1) {
+    dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+    dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+    dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
+    dst_argb[3] = (src_argb[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
+    dst_argb += 4;
+  }
   // Duplicate the last pixel (4 bytes) for filtering.
   dst_argb[0] = dst_argb[-4];
   dst_argb[1] = dst_argb[-3];

@@ -975,21 +981,20 @@ static void ScaleARGBBilinear(int src_width, int src_height,
                               ptrdiff_t src_stride,
                               int dst_width, int source_y_fraction) =
       ScaleARGBFilterRows_C;
-  // TODO(fbarchard): Check aligned width.
 #if defined(HAS_SCALEARGBFILTERROWS_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
     ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
   }
 #endif
 #if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
     ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
   }
 #endif
 #if defined(HAS_SCALEARGBFILTERROWS_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 4)) {
     ScaleARGBFilterRows = ScaleARGBFilterRows_NEON;
   }
 #endif

@@ -478,8 +478,8 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
   /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
   align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \
   align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \
-  memset(dst_argb32_c, 0, kWidth * 4 * kHeight); \
-  memset(dst_argb32_opt, 0, kWidth * 4 * kHeight); \
+  memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \
+  memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \
   FMT_B##ToARGB(dst_argb_c, kStrideB, \
                 dst_argb32_c, kWidth * 4, \
                 kWidth, kHeight); \

@@ -534,6 +534,12 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
   align_buffer_64(dst_y_opt, kWidth * kHeight); \
   align_buffer_64(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
   align_buffer_64(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  memset(dst_y_c, 1, kWidth * kHeight); \
+  memset(dst_u_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  memset(dst_v_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  memset(dst_y_opt, 2, kWidth * kHeight); \
+  memset(dst_u_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  memset(dst_v_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
   srandom(time(NULL)); \
   for (int i = 0; i < kHeight; ++i) \
     for (int j = 0; j < kStride; ++j) \

@@ -753,11 +759,11 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
   align_buffer_page_end(src_argb, kStrideA * kHeightA); \
   align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
   align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
-  memset(dst_argb_c, 0, kStrideB * kHeightB); \
-  memset(dst_argb_opt, 0, kStrideB * kHeightB); \
   for (int i = 0; i < kStrideA * kHeightA; ++i) { \
     src_argb[i] = (random() & 0xff); \
   } \
+  memset(dst_argb_c, 0, kStrideB * kHeightB); \
+  memset(dst_argb_opt, 0, kStrideB * kHeightB); \
   MaskCpuFlags(0); \
   FMT_A##To##FMT_B(src_argb, kStrideA, \
                    dst_argb_c, kStrideB, \

@@ -98,12 +98,75 @@ TEST_F(libyuvTest, TestAttenuate) {
   EXPECT_EQ(32, atten_pixels[128][1]);
   EXPECT_EQ(21, atten_pixels[128][2]);
   EXPECT_EQ(128, atten_pixels[128][3]);
-  EXPECT_EQ(255, atten_pixels[255][0]);
-  EXPECT_EQ(127, atten_pixels[255][1]);
-  EXPECT_EQ(85, atten_pixels[255][2]);
+  EXPECT_NEAR(255, atten_pixels[255][0], 1);
+  EXPECT_NEAR(127, atten_pixels[255][1], 1);
+  EXPECT_NEAR(85, atten_pixels[255][2], 1);
   EXPECT_EQ(255, atten_pixels[255][3]);
 }
+
+static int TestAttenuateI(int width, int height, int benchmark_iterations,
+                          int invert, int off) {
+  const int kBpp = 4;
+  const int kStride = (width * kBpp + 15) & ~15;
+  align_buffer_64(src_argb, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
+  srandom(time(NULL));
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb[i + off] = (random() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(0);
+  ARGBAttenuate(src_argb + off, kStride,
+                dst_argb_c, kStride,
+                width, invert * height);
+  MaskCpuFlags(-1);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBAttenuate(src_argb + off, kStride,
+                  dst_argb_opt, kStride,
+                  width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_64(src_argb)
+  free_aligned_buffer_64(dst_argb_c)
+  free_aligned_buffer_64(dst_argb_opt)
+  return max_diff;
+}
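A note on the invert parameter used by the tests that follow: passing invert * height with invert = -1 exercises libyuv's negative-height convention, which means "process the image bottom-up". Functions such as ARGBAttenuate normalize it internally with the usual idiom; this is a sketch of that convention, not code from this commit:

    // Negative height means the image is inverted vertically (sketch).
    if (height < 0) {
      height = -height;
      src_argb = src_argb + (height - 1) * src_stride_argb;  // last row first.
      src_stride_argb = -src_stride_argb;                    // step upward.
    }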
+TEST_F(libyuvTest, ARGBAttenuate_Any) {
+  int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
+                                benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Unaligned) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, +1, 1);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Invert) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, -1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Opt) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
 TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
   SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
   SIMD_ALIGNED(int32 added_pixels[16][16][4]);

@@ -632,7 +695,7 @@ TEST_F(libyuvTest, ARGBInterpolate##TERP##N) { \
 #define TESTINTERPOLATE(TERP) \
   TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
-           benchmark_width_ - 4, TERP, 1, _Any, +, 0) \
+           benchmark_width_ - 1, TERP, 1, _Any, +, 0) \
   TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
            benchmark_width_, TERP, 1, _Unaligned, +, 1) \
   TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \

@@ -648,42 +711,38 @@ TESTINTERPOLATE(255)
 static int TestBlend(int width, int height, int benchmark_iterations,
                      int invert, int off) {
-  const int BPP_A = 4;
-  const int STRIDE_A = 1;
-  const int BPP_B = 4;
-  const int STRIDE_B = 1;
-  const int kStrideA = (width * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;
-  const int kStrideB = (width * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;
-  align_buffer_64(src_argb_a, kStrideA * height + off);
-  align_buffer_64(src_argb_b, kStrideA * height + off);
-  align_buffer_64(dst_argb_c, kStrideB * height);
-  align_buffer_64(dst_argb_opt, kStrideB * height);
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_64(src_argb_a, kStride * height + off);
+  align_buffer_64(src_argb_b, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
   srandom(time(NULL));
-  for (int i = 0; i < kStrideA * height; ++i) {
+  for (int i = 0; i < kStride * height; ++i) {
     src_argb_a[i + off] = (random() & 0xff);
     src_argb_b[i + off] = (random() & 0xff);
   }
-  ARGBAttenuate(src_argb_a + off, kStrideA, src_argb_a + off, kStrideA, width,
+  ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
                 height);
-  ARGBAttenuate(src_argb_b + off, kStrideA, src_argb_b + off, kStrideA, width,
+  ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
                 height);
-  memset(dst_argb_c, 255, kStrideB * height);
-  memset(dst_argb_opt, 255, kStrideB * height);
+  memset(dst_argb_c, 255, kStride * height);
+  memset(dst_argb_opt, 255, kStride * height);
   MaskCpuFlags(0);
-  ARGBBlend(src_argb_a + off, kStrideA,
-            src_argb_b + off, kStrideA,
-            dst_argb_c, kStrideB,
+  ARGBBlend(src_argb_a + off, kStride,
+            src_argb_b + off, kStride,
+            dst_argb_c, kStride,
             width, invert * height);
   MaskCpuFlags(-1);
   for (int i = 0; i < benchmark_iterations; ++i) {
-    ARGBBlend(src_argb_a + off, kStrideA,
-              src_argb_b + off, kStrideA,
-              dst_argb_opt, kStrideB,
+    ARGBBlend(src_argb_a + off, kStride,
+              src_argb_b + off, kStride,
+              dst_argb_opt, kStride,
              width, invert * height);
   }
   int max_diff = 0;
-  for (int i = 0; i < kStrideB * height; ++i) {
+  for (int i = 0; i < kStride * height; ++i) {
     int abs_diff =
         abs(static_cast<int>(dst_argb_c[i]) -
             static_cast<int>(dst_argb_opt[i]));