From 1d160cb99f2b05df80c4555bd769825ad1175dc9 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Wed, 28 Nov 2012 20:02:55 +0000
Subject: [PATCH] Attenuate ARGB pixels NEON optimized

BUG=164
TEST=./libyuv_unittest --gtest_filter=*Atten*
Review URL: https://webrtc-codereview.appspot.com/937031

git-svn-id: http://libyuv.googlecode.com/svn/trunk@506 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            | 2 +-
 include/libyuv/cpu_id.h    | 2 +-
 include/libyuv/row.h       | 16 +++++-
 include/libyuv/version.h   | 2 +-
 source/convert.cc          | 29 ++++------
 source/convert_argb.cc     | 16 ++++--
 source/convert_from.cc     | 45 ++++++---------
 source/cpu_id.cc           | 3 +-
 source/planar_functions.cc | 34 ++++++++----
 source/rotate_neon.cc      | 6 +-
 source/row_any.cc          | 26 +++++++++
 source/row_neon.cc         | 55 ++++++++++++++++++
 source/row_posix.cc        | 5 +-
 source/row_win.cc          | 5 +-
 source/scale.cc            | 9 ++-
 source/scale_argb.cc       | 23 +++++---
 unit_test/convert_test.cc  | 14 +++-
 unit_test/planar_test.cc   | 111 ++++++++++++++++++++++++++++---------
 18 files changed, 280 insertions(+), 123 deletions(-)

diff --git a/README.chromium b/README.chromium
index 78e01936c..ea7b076b9 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 505
+Version: 506
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h
index 7727f2760..0c50886cf 100644
--- a/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -53,7 +53,7 @@ int ArmCpuCaps(const char* cpuinfo_name);
 // returns non-zero if instruction set is detected
 static __inline int TestCpuFlag(int test_flag) {
   LIBYUV_API extern int cpu_info_;
-  return (cpu_info_ == 1 ? InitCpuFlags() : cpu_info_) & test_flag;
+  return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
 }
 // For testing, allow CPU flags to be disabled.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 7906d0153..2435138ec 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -139,7 +139,7 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBATTENUATE_SSE2
+#define HAS_ARGBATTENUATEROW_SSE2
 #define HAS_ARGBBLENDROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #endif
@@ -221,6 +221,7 @@ extern "C" {
 // Effects
 #define HAS_ARGBINTERPOLATEROW_NEON
 #define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
 #endif
 // The following are available on Mips platforms
@@ -935,6 +936,12 @@ void YToARGBRow_SSE2(const uint8* src_y,
                      uint8* dst_argb,
                      int width);
 void YToARGBRow_NEON(const uint8* src_y,
                      uint8* dst_argb,
                      int width);
+void YToARGBRow_Any_SSE2(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
+void YToARGBRow_Any_NEON(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
 // ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, @@ -1194,6 +1201,13 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y, void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + int width); // Inverse table for unattenuate, shared by C and SSE2. extern uint32 fixed_invtbl8[256]; diff --git a/include/libyuv/version.h b/include/libyuv/version.h index af349e63e..90da7968c 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 505 +#define LIBYUV_VERSION 506 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index a35cd1769..11d32cd77 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -18,6 +18,7 @@ #endif #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() #include "libyuv/video_common.h" #include "libyuv/row.h" @@ -215,12 +216,7 @@ int I444ToI420(const uint8* src_y, int src_stride_y, return 0; } -// use Bilinear for upsampling chroma -void ScalePlaneBilinear(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr); - +// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler. // 411 chroma is 1/4 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API @@ -256,19 +252,15 @@ int I411ToI420(const uint8* src_y, int src_stride_y, int halfheight = (height + 1) >> 1; int quarterwidth = (width + 3) >> 2; - // Resample U plane. - ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height - halfwidth, halfheight, // to 1/2 width, 1/2 height - src_stride_u, - dst_stride_u, - src_u, dst_u); + // Resample U plane from 1/4 width, 1x height to 1/2 width, 1/2 height. + ScalePlane(src_u, src_stride_u, quarterwidth, height, + dst_u, dst_stride_u, halfwidth, halfheight, + kFilterNone); // Resample V plane. - ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height - halfwidth, halfheight, // to 1/2 width, 1/2 height - src_stride_v, - dst_stride_v, - src_v, dst_v); + ScalePlane(src_v, src_stride_v, quarterwidth, height, + dst_v, dst_stride_v, halfwidth, halfheight, + kFilterNone); return 0; } @@ -1738,7 +1730,6 @@ static void JpegI400ToI420(void* opaque, LIBYUV_API int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) { - // TODO(fbarchard): Port to C MJpegDecoder mjpeg_decoder; bool ret = mjpeg_decoder.LoadFrame(sample, sample_size); if (ret) { @@ -1764,7 +1755,7 @@ int MJPGToI420(const uint8* sample, return -1; } - // TODO(fbarchard): Port to C + // TODO(fbarchard): Port MJpeg to C. 
 MJpegDecoder mjpeg_decoder;
 bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
 if (ret && (mjpeg_decoder.GetWidth() != w ||
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index b9c46d222..b9ec60f98 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -230,13 +230,19 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                        uint8* rgb_buf,
                        int width) = YToARGBRow_C;
 #if defined(HAS_YTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) &&
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    YToARGBRow = YToARGBRow_SSE2;
+    YToARGBRow = YToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_SSE2;
+    }
   }
 #elif defined(HAS_YTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    YToARGBRow = YToARGBRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YToARGBRow = YToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_NEON;
+    }
   }
 #endif
@@ -941,7 +947,7 @@ int MJPGToARGB(const uint8* sample,
     return -1;
   }
-  // TODO(fbarchard): Port to C
+  // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret && (mjpeg_decoder.GetWidth() != w ||
diff --git a/source/convert_from.cc b/source/convert_from.cc
index a4233b1c2..549af8564 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -16,6 +16,7 @@
 #include "libyuv/format_conversion.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
 #include "libyuv/row.h"
@@ -98,12 +99,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr);
-
+// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler.
 LIBYUV_API
 int I420ToI444(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -136,19 +132,15 @@ int I420ToI444(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  // Upsample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Upsample U plane from 1/2 width, 1/2 height to 1x width, 1x height.
+  ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+             dst_u, dst_stride_u, width, height,
+             kFilterNone);
   // Upsample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+             dst_v, dst_stride_v, width, height,
+             kFilterNone);
   return 0;
 }
@@ -187,19 +179,15 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
   int halfheight = (height + 1) >> 1;
   int quarterwidth = (width + 3) >> 2;
-  // Resample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Resample U plane from 1/2 width, 1/2 height to 1/4 width, 1x height.
+  ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+             dst_u, dst_stride_u, quarterwidth, height,
+             kFilterNone);
   // Resample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+             dst_v, dst_stride_v, quarterwidth, height,
+             kFilterNone);
   return 0;
 }
@@ -360,7 +348,6 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// TODO(fbarchard): Deprecate, move or expand 422 support?
 LIBYUV_API
 int I422ToUYVY(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index 4032080f9..b932beb54 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -138,9 +138,8 @@ static int MipsCpuCaps(const char* search_string) {
 #endif
 // CPU detect function for SIMD instruction sets.
-// TODO(fbarchard): Use constant if/when valgrind says cpu_info is initialized.
 LIBYUV_API
-int cpu_info_ = 1;  // 1 means cpu info is not initialized yet.
+int cpu_info_ = kCpuInit;  // cpu_info is not initialized yet.
 // Test environment variable for disabling CPU features. Any non-zero value
 // to disable. Zero ignored to make it easy to set the variable on/off.
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 2da083abd..6b5c3ecfd 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -767,18 +767,32 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
   }
   void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
                            int width) = ARGBAttenuateRow_C;
-#if defined(HAS_ARGBATTENUATE_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
   }
 #endif
@@ -1126,9 +1140,8 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
 }
 // Interpolate 2 ARGB images by specified amount (0 to 255).
-// TODO(fbarchard): Check width is multiple of 16. Do Any version.
-// TODO(fbarchard): Consider selecting a specialized interpolator so
-// interpolation doesn't need to be checked on each row.
+// TODO(fbarchard): Consider selecting a specialization for interpolation so
+// row function doesn't need to check interpolation on each row.
 LIBYUV_API
 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                     const uint8* src_argb1, int src_stride_argb1,
@@ -1147,15 +1160,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) = ARGBInterpolateRow_C;
 #if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
       IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
     ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
   }
 #elif defined(HAS_ARGBINTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
     ARGBInterpolateRow = ARGBInterpolateRow_NEON;
   }
 #endif
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index 49b300325..a0b3d291f 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -113,8 +113,8 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
     "vtbl.8    d0, {d2, d3}, d6                \n"
     "vtbl.8    d1, {d2, d3}, d7                \n"
-    // TODO: rework shuffle above to write
-    // out with 4 instead of 8 writes
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
     "vst1.32   {d4[0]}, [r9], %3               \n"
     "vst1.32   {d4[1]}, [r9], %3               \n"
     "vst1.32   {d5[0]}, [r9], %3               \n"
@@ -276,7 +276,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
     "cmp       %6, #4                          \n"
     "blt       2f                              \n"
-    //TODO(frkoenig) : clean this up
+    // TODO(frkoenig): Clean this up.
     // 4x8 block
     "mov       r9, %0                          \n"
    "vld1.64   {d0}, [r9], %1                  \n"
diff --git a/source/row_any.cc b/source/row_any.cc
index 78921afd2..ce9f352c1 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -141,6 +141,8 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
        3, 4, 2)
 RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
        7, 1, 4)
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
+       7, 1, 4)
 RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
        15, 2, 4)
 RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
@@ -157,6 +159,8 @@ RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
        7, 4, 2)
 RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
        7, 1, 4)
+RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
+       7, 1, 4)
 RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
        7, 2, 4)
 RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
@@ -226,6 +230,28 @@ YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
 #endif
 #undef YANY
+// Attenuate is destructive, so the last16 method cannot be used due to overlap.
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ + int n = width & ~MASK; \ + ARGBTOY_SIMD(src_argb, dst_y, n); \ + ARGBTOY_C(src_argb + n * SBPP, \ + dst_y + n * BPP, width & MASK); \ + } + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSE2 +YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_NEON +YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, + 4, 4, 7) +#endif + // RGB/YUV to UV does multiple of 16 with SIMD and remainder with C. #define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \ void NAMEANY(const uint8* src_argb, int src_stride_argb, \ diff --git a/source/row_neon.cc b/source/row_neon.cc index 67ff79736..d859ca7b3 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2418,6 +2418,61 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ); } +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + ); +} + +#ifdef ARGBATTENUATEROW_VQRDMULH +// TODO(fbarchard): Remove this. Works but is slower and off by 2. +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vmovl.u8 q8, d6 \n" + "vshl.u16 q0, q0, #7 \n" // b << 7 + "vshl.u16 q1, q1, #7 \n" // g << 7 + "vshl.u16 q2, q2, #7 \n" // r << 7 + "vqrdmulh.s16 q0, q0, q8 \n" // b * a + "vqrdmulh.s16 q1, q1, q8 \n" // g * a + "vqrdmulh.s16 q2, q2, q8 \n" // r * a + "vmovn.u16 d0, q0 \n" + "vmovn.u16 d2, q1 \n" + "vmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q8" + ); +} +#endif + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/source/row_posix.cc b/source/row_posix.cc index aab0e920d..920a8c404 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -3519,7 +3519,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_ARGBATTENUATE_SSE2 +#ifdef HAS_ARGBATTENUATEROW_SSE2 // Attenuate 4 pixels at a time. 
// aligned to 16 bytes void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { @@ -3564,7 +3564,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { #endif ); } -#endif // HAS_ARGBATTENUATE_SSE2 +#endif // HAS_ARGBATTENUATEROW_SSE2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha @@ -4132,7 +4132,6 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBAFFINEROW_SSE2 // TODO(fbarchard): Find 64 bit way to avoid masking. -// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2. // Copy ARGB pixels from source image with slope to a row of destination. // Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing // an error if movq is used. movd %%xmm0,%1 diff --git a/source/row_win.cc b/source/row_win.cc index a2d96124a..b0d8a1117 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1675,7 +1675,6 @@ static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; -// TODO(fbarchard): NV12/NV21 fetch UV and use directly. // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. // Read 8 UV from 411. @@ -3701,7 +3700,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_ARGBATTENUATE_SSE2 +#ifdef HAS_ARGBATTENUATEROW_SSE2 // Attenuate 4 pixels at a time. // Aligned to 16 bytes. __declspec(naked) __declspec(align(16)) @@ -3743,7 +3742,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ret } } -#endif // HAS_ARGBATTENUATE_SSE2 +#endif // HAS_ARGBATTENUATEROW_SSE2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. 
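For reference, the attenuate rows above (SSE2, SSSE3, and the new NEON version) all compute per-channel premultiplication by alpha. A minimal C sketch of one row follows; it mirrors the rounding of the NEON path (vqrshrn.u16 #8, i.e. (c * a + 128) >> 8) and uses hypothetical names: AttenuateRow_Sketch is illustrative, not libyuv's ARGBAttenuateRow_C, whose exact rounding may differ.

#include <stdint.h>

// Premultiply one row of ARGB pixels (B, G, R, A byte order in memory) by
// their alpha. Rounding matches the NEON vqrshrn.u16 #8 above:
// result = (channel * alpha + 128) >> 8.
static void AttenuateRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t b = src_argb[0];
    uint32_t g = src_argb[1];
    uint32_t r = src_argb[2];
    uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)((b * a + 128) >> 8);  // b * a, rounded.
    dst_argb[1] = (uint8_t)((g * a + 128) >> 8);  // g * a, rounded.
    dst_argb[2] = (uint8_t)((r * a + 128) >> 8);  // r * a, rounded.
    dst_argb[3] = (uint8_t)a;                     // Alpha passes through.
    src_argb += 4;
    dst_argb += 4;
  }
}

Note that a fully opaque channel does not quite survive: (255 * 255 + 128) >> 8 is 254, which is why the planar_test.cc checks below switch to EXPECT_NEAR with a tolerance of 1.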
diff --git a/source/scale.cc b/source/scale.cc index f686dc67e..0c6036a74 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -3091,18 +3091,18 @@ void ScalePlaneBilinear(int src_width, int src_height, int dst_width, int source_y_fraction) = ScaleFilterRows_C; #if defined(HAS_SCALEFILTERROWS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) { ScaleFilterRows = ScaleFilterRows_NEON; } #endif #if defined(HAS_SCALEFILTERROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16) && IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleFilterRows = ScaleFilterRows_SSE2; } #endif #if defined(HAS_SCALEFILTERROWS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 16)) { ScaleFilterRows = ScaleFilterRows_Unaligned_SSSE3; if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleFilterRows = ScaleFilterRows_SSSE3; @@ -3110,7 +3110,7 @@ void ScalePlaneBilinear(int src_width, int src_height, } #endif #if defined(HAS_SCALEFILTERROWS_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4)) { ScaleFilterRows = ScaleFilterRows_MIPS_DSPR2; } @@ -3129,7 +3129,6 @@ void ScalePlaneBilinear(int src_width, int src_height, int yf = (y >> 8) & 255; const uint8* src = src_ptr + yi * src_stride; ScaleFilterRows(row, src, src_stride, src_width, yf); - row[src_width] = row[src_width - 1]; ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx); dst_ptr += dst_stride; y += dy; diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 635269ad2..c0a8b8912 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -856,8 +856,7 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb, int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_argb + src_stride; - uint8* end = dst_argb + (dst_width << 2); - do { + for (int x = 0; x < dst_width - 1; x += 2) { dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; @@ -869,7 +868,14 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb, src_argb += 8; src_ptr1 += 8; dst_argb += 8; - } while (dst_argb < end); + } + if (dst_width & 1) { + dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_argb[3] = (src_argb[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_argb += 4; + } // Duplicate the last pixel (4 bytes) for filtering. dst_argb[0] = dst_argb[-4]; dst_argb[1] = dst_argb[-3]; @@ -975,21 +981,20 @@ static void ScaleARGBBilinear(int src_width, int src_height, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = ScaleARGBFilterRows_C; -// TODO(fbarchard): Check aligned width. 
#if defined(HAS_SCALEARGBFILTERROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2; } #endif #if defined(HAS_SCALEARGBFILTERROWS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERROWS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 4)) { ScaleARGBFilterRows = ScaleARGBFilterRows_NEON; } #endif diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 3e4de6dc5..01c09b0ae 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -478,8 +478,8 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \ align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \ - memset(dst_argb32_c, 0, kWidth * 4 * kHeight); \ - memset(dst_argb32_opt, 0, kWidth * 4 * kHeight); \ + memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \ + memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \ FMT_B##ToARGB(dst_argb_c, kStrideB, \ dst_argb32_c, kWidth * 4, \ kWidth, kHeight); \ @@ -534,6 +534,12 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ align_buffer_64(dst_y_opt, kWidth * kHeight); \ align_buffer_64(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ align_buffer_64(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + memset(dst_y_c, 1, kWidth * kHeight); \ + memset(dst_u_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + memset(dst_v_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + memset(dst_y_opt, 2, kWidth * kHeight); \ + memset(dst_u_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + memset(dst_v_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ srandom(time(NULL)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kStride; ++j) \ @@ -753,11 +759,11 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \ align_buffer_page_end(src_argb, kStrideA * kHeightA); \ align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ + memset(dst_argb_c, 0, kStrideB * kHeightB); \ + memset(dst_argb_opt, 0, kStrideB * kHeightB); \ for (int i = 0; i < kStrideA * kHeightA; ++i) { \ src_argb[i] = (random() & 0xff); \ } \ - memset(dst_argb_c, 0, kStrideB * kHeightB); \ - memset(dst_argb_opt, 0, kStrideB * kHeightB); \ MaskCpuFlags(0); \ FMT_A##To##FMT_B(src_argb, kStrideA, \ dst_argb_c, kStrideB, \ diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 11b286b13..169d96317 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -98,12 +98,75 @@ TEST_F(libyuvTest, TestAttenuate) { EXPECT_EQ(32, atten_pixels[128][1]); EXPECT_EQ(21, atten_pixels[128][2]); EXPECT_EQ(128, atten_pixels[128][3]); - EXPECT_EQ(255, atten_pixels[255][0]); - EXPECT_EQ(127, atten_pixels[255][1]); - EXPECT_EQ(85, atten_pixels[255][2]); + EXPECT_NEAR(255, atten_pixels[255][0], 1); + EXPECT_NEAR(127, atten_pixels[255][1], 1); + EXPECT_NEAR(85, atten_pixels[255][2], 1); EXPECT_EQ(255, atten_pixels[255][3]); } +static int TestAttenuateI(int width, int height, int 
benchmark_iterations,
+                          int invert, int off) {
+  const int kBpp = 4;
+  const int kStride = (width * kBpp + 15) & ~15;
+  align_buffer_64(src_argb, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
+  srandom(time(NULL));
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb[i + off] = (random() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(0);
+  ARGBAttenuate(src_argb + off, kStride,
+                dst_argb_c, kStride,
+                width, invert * height);
+  MaskCpuFlags(-1);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBAttenuate(src_argb + off, kStride,
+                  dst_argb_opt, kStride,
+                  width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_64(src_argb)
+  free_aligned_buffer_64(dst_argb_c)
+  free_aligned_buffer_64(dst_argb_opt)
+  return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Any) {
+  int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
+                                benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Unaligned) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, +1, 1);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Invert) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, -1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Opt) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
 TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
   SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
   SIMD_ALIGNED(int32 added_pixels[16][16][4]);
@@ -632,7 +695,7 @@ TEST_F(libyuvTest, ARGBInterpolate##TERP##N) { \
 #define TESTINTERPOLATE(TERP) \
     TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
-             benchmark_width_ - 4, TERP, 1, _Any, +, 0) \
+             benchmark_width_ - 1, TERP, 1, _Any, +, 0) \
     TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
              benchmark_width_, TERP, 1, _Unaligned, +, 1) \
     TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
@@ -648,42 +711,38 @@ TESTINTERPOLATE(255)
 static int TestBlend(int width, int height, int benchmark_iterations,
                      int invert, int off) {
-  const int BPP_A = 4;
-  const int STRIDE_A = 1;
-  const int BPP_B = 4;
-  const int STRIDE_B = 1;
-  const int kStrideA = (width * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;
-  const int kStrideB = (width * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;
-  align_buffer_64(src_argb_a, kStrideA * height + off);
-  align_buffer_64(src_argb_b, kStrideA * height + off);
-  align_buffer_64(dst_argb_c, kStrideB * height);
-  align_buffer_64(dst_argb_opt, kStrideB * height);
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_64(src_argb_a, kStride * height + off);
+  align_buffer_64(src_argb_b, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
   srandom(time(NULL));
-  for (int i = 0; i < kStrideA * height; ++i) {
+  for (int i = 0; i < kStride * height; ++i) {
     src_argb_a[i + off] = (random() & 0xff);
     src_argb_b[i + off] = (random() & 0xff);
   }
-  ARGBAttenuate(src_argb_a + off, kStrideA, src_argb_a + off, kStrideA, width,
+  ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
                 height);
-  ARGBAttenuate(src_argb_b + off, kStrideA, src_argb_b + off, kStrideA, width,
+  ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
                 height);
-  memset(dst_argb_c, 255, kStrideB * height);
-  memset(dst_argb_opt, 255, kStrideB * height);
+  memset(dst_argb_c, 255, kStride * height);
+  memset(dst_argb_opt, 255, kStride * height);
   MaskCpuFlags(0);
-  ARGBBlend(src_argb_a + off, kStrideA,
-            src_argb_b + off, kStrideA,
-            dst_argb_c, kStrideB,
+  ARGBBlend(src_argb_a + off, kStride,
+            src_argb_b + off, kStride,
+            dst_argb_c, kStride,
             width, invert * height);
   MaskCpuFlags(-1);
   for (int i = 0; i < benchmark_iterations; ++i) {
-    ARGBBlend(src_argb_a + off, kStrideA,
-              src_argb_b + off, kStrideA,
-              dst_argb_opt, kStrideB,
+    ARGBBlend(src_argb_a + off, kStride,
+              src_argb_b + off, kStride,
+              dst_argb_opt, kStride,
               width, invert * height);
   }
   int max_diff = 0;
-  for (int i = 0; i < kStrideB * height; ++i) {
+  for (int i = 0; i < kStride * height; ++i) {
     int abs_diff =
         abs(static_cast<int>(dst_argb_c[i]) -
             static_cast<int>(dst_argb_opt[i]));
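A closing note on the ARGBAttenuateRow_Any_* wrappers generated by the second YANY macro in row_any.cc: because attenuate is destructive, re-running the SIMD kernel over the last 16 bytes (the usual tail trick) would attenuate those pixels twice, so the remainder is finished with the C row instead. A sketch of that dispatch follows; the typedef and function names are illustrative, not libyuv's.

#include <stdint.h>

typedef void (*AttenuateRowFn)(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width);

// Run the SIMD kernel on the largest prefix that is a multiple of its pixel
// granularity, then finish the 0..mask leftover pixels with the C version.
// mask is granularity - 1: 3 for the 4-pixel SSE2/SSSE3 rows, 7 for the
// 8-pixel NEON row, matching the YANY(..., 4, 4, MASK) instantiations above.
static void AttenuateRowAny(AttenuateRowFn simd, AttenuateRowFn c_version,
                            const uint8_t* src_argb, uint8_t* dst_argb,
                            int width, int mask) {
  int n = width & ~mask;        // Pixels the SIMD kernel will handle.
  if (n > 0) {                  // Callers in the patch guard width >= mask + 1.
    simd(src_argb, dst_argb, n);
  }
  c_version(src_argb + n * 4,   // 4 bytes per ARGB pixel.
            dst_argb + n * 4,
            width & mask);      // Remaining pixels, done once, in C.
}

This is the same shape as the macro the patch adds, written out as a function: ARGBAttenuate in planar_functions.cc selects an _Any_ variant whenever width is not a multiple of the kernel's granularity, which is what the ARGBAttenuate_Any test with benchmark_width_ - 1 exercises.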