diff --git a/README.chromium b/README.chromium
index f3510711f..4a239b3cb 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1721
+Version: 1722
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 282d5216f..6e207a9c6 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -58,6 +58,7 @@ extern "C" {
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #define HAS_FIXEDDIV1_X86
 #define HAS_FIXEDDIV_X86
+#define HAS_SCALEADDROW_SSE2
 #define HAS_SCALEARGBCOLS_SSE2
 #define HAS_SCALEARGBCOLSUP2_SSE2
 #define HAS_SCALEARGBFILTERCOLS_SSSE3
@@ -69,7 +70,6 @@ extern "C" {
 #define HAS_SCALEROWDOWN34_SSSE3
 #define HAS_SCALEROWDOWN38_SSSE3
 #define HAS_SCALEROWDOWN4_SSSE3
-#define HAS_SCALEADDROW_SSE2
 #endif
 
 // The following are available on all x86 platforms, but
@@ -86,7 +86,9 @@ extern "C" {
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEADDROW_NEON
 #define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 #define HAS_SCALEFILTERCOLS_NEON
@@ -94,7 +96,6 @@ extern "C" {
 #define HAS_SCALEROWDOWN34_NEON
 #define HAS_SCALEROWDOWN38_NEON
 #define HAS_SCALEROWDOWN4_NEON
-#define HAS_SCALEARGBFILTERCOLS_NEON
 #endif
 
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -113,18 +114,18 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
 #define HAS_FIXEDDIV1_MIPS
 #define HAS_FIXEDDIV_MIPS
+#define HAS_SCALEADDROW_16_MMI
+#define HAS_SCALEADDROW_MMI
 #define HAS_SCALEARGBCOLS_MMI
 #define HAS_SCALEARGBCOLSUP2_MMI
 #define HAS_SCALEARGBROWDOWN2_MMI
 #define HAS_SCALEARGBROWDOWNEVEN_MMI
-#define HAS_SCALEROWDOWN2_MMI
-#define HAS_SCALEROWDOWN4_MMI
-#define HAS_SCALEADDROW_MMI
-#define HAS_SCALEADDROW_16_MMI
-#define HAS_SCALEROWDOWN2_16_MMI
-#define HAS_SCALEROWDOWN4_16_MMI
-#define HAS_SCALECOLS_MMI
 #define HAS_SCALECOLS_16_MMI
+#define HAS_SCALECOLS_MMI
+#define HAS_SCALEROWDOWN2_16_MMI
+#define HAS_SCALEROWDOWN2_MMI
+#define HAS_SCALEROWDOWN4_16_MMI
+#define HAS_SCALEROWDOWN4_MMI
 #endif
 
 // Scale ARGB vertically with bilinear interpolation.
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index cb3241c14..1a38ba7d6 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1721
+#define LIBYUV_VERSION 1722
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 143f75be3..b376a0f38 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -1980,9 +1980,8 @@ int NV12ToRAW(const uint8_t* src_y,
               int dst_stride_raw,
               int width,
               int height) {
-  return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
-                           dst_raw, dst_stride_raw, &kYvuI601Constants,
-                           width, height);
+  return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
+                           dst_stride_raw, &kYvuI601Constants, width, height);
 }
 
 // Convert NV21 to RAW.
@@ -1995,9 +1994,8 @@ int NV21ToRAW(const uint8_t* src_y,
              int dst_stride_raw,
              int width,
              int height) {
-  return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
-                           dst_raw, dst_stride_raw, &kYvuI601Constants,
-                           width, height);
+  return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
+                           dst_stride_raw, &kYvuI601Constants, width, height);
 }
 
 // Convert M420 to ARGB.
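Note on the HAS_SCALEADDROW_SSE2 / HAS_SCALEADDROW_NEON defines added above: they only record that a SIMD ScaleAddRow kernel is compiled in; the box-filter scaler still picks an implementation at run time. A minimal sketch of that selection, assuming libyuv's usual TestCpuFlag()/IS_ALIGNED() helpers and kernel names (ChooseScaleAddRow is a hypothetical wrapper for illustration; in the library the equivalent logic sits inline in ScalePlaneBox):

typedef void (*ScaleAddRowFn)(const uint8_t* src_ptr, uint16_t* dst_ptr,
                              int src_width);

// Hypothetical helper, illustration only: prefer the widest kernel the CPU
// supports, use the _Any_ wrapper when the width is not a multiple of 16,
// and fall back to the portable C version otherwise.
static ScaleAddRowFn ChooseScaleAddRow(int src_width) {
  ScaleAddRowFn fn = ScaleAddRow_C;
#if defined(HAS_SCALEADDROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    fn = ScaleAddRow_Any_SSE2;
    if (IS_ALIGNED(src_width, 16)) {
      fn = ScaleAddRow_SSE2;  // 16 pixels per iteration, exact widths only
    }
  }
#endif
#if defined(HAS_SCALEADDROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    fn = ScaleAddRow_Any_NEON;
    if (IS_ALIGNED(src_width, 16)) {
      fn = ScaleAddRow_NEON;
    }
  }
#endif
  return fn;
}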
diff --git a/source/scale_any.cc b/source/scale_any.cc
index 8714c3695..17831372c 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -8,6 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <string.h>  // For memset/memcpy
+
 #include "libyuv/scale.h"
 #include "libyuv/scale_row.h"
 
@@ -499,6 +501,45 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, 1)
 #endif
 
+#ifdef SASIMDONLY
+// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
+
+// Add rows box filter scale down. Using macro from row_any
+#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                      \
+  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
+    SIMD_ALIGNED(uint16_t dst_temp[32]);                               \
+    SIMD_ALIGNED(uint8_t src_temp[32]);                                \
+    memset(dst_temp, 0, 32 * 2); /* for msan */                        \
+    int r = width & MASK;                                              \
+    int n = width & ~MASK;                                             \
+    if (n > 0) {                                                       \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                   \
+    }                                                                  \
+    memcpy(src_temp, src_ptr + n * SBPP, r * SBPP);                    \
+    memcpy(dst_temp, dst_ptr + n * BPP, r * BPP);                      \
+    ANY_SIMD(src_temp, dst_temp, MASK + 1);                            \
+    memcpy(dst_ptr + n * BPP, dst_temp, r * BPP);                      \
+  }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
+#endif
+#undef SAANY
+
+#else
+
 // Add rows box filter scale down.
 #define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)               \
   void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {  \
@@ -526,6 +567,8 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
 #endif
 #undef SAANY
 
+#endif  // SASIMDONLY
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
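For reference, the default (non-SASIMDONLY) path above wraps each SIMD kernel so that any width is accepted: the SIMD function consumes the largest multiple of its step and ScaleAddRow_C finishes the leftovers. A sketch of roughly what one SAANY instantiation expands to (illustrative; the exact macro body lives in scale_any.cc):

// Approximate expansion of SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON,
// ScaleAddRow_C, 15) -- a sketch, not the literal preprocessor output.
void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
                          uint16_t* dst_ptr,
                          int src_width) {
  int n = src_width & ~15;  // largest multiple of 16
  if (n > 0) {
    ScaleAddRow_NEON(src_ptr, dst_ptr, n);  // SIMD path, 16 pixels per step
  }
  ScaleAddRow_C(src_ptr + n, dst_ptr + n, src_width & 15);  // 0..15 leftovers
}

This is why the new NEON kernels below only need to handle widths that are exact multiples of 16.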
\n" // load 16 bytes + "vaddw.u8 q2, q2, d1 \n" // add + "vaddw.u8 q1, q1, d0 \n" + "vst1.16 {q1, q2}, [%1]! \n" // store accumulator + "subs %2, %2, #16 \n" // 16 processed per loop "bgt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 : - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List + : "memory", "cc", "q0", "q1", "q2" // Clobber List ); } diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 494a9cfbf..f4aed5fc9 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -515,37 +515,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, "v19", "v30", "v31", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int src_width, - int src_height) { - const uint8_t* src_tmp; +// Add a row of bytes to a row of shorts. Used for box filter. +// Reads 16 bytes and accumulates to 16 shorts at a time. +void ScaleAddRow_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { asm volatile( "1: \n" - "mov %0, %1 \n" - "mov w12, %w5 \n" - "eor v2.16b, v2.16b, v2.16b \n" - "eor v3.16b, v3.16b, v3.16b \n" - "2: \n" - // load 16 pixels into q0 - "ld1 {v0.16b}, [%0], %3 \n" - "uaddw2 v3.8h, v3.8h, v0.16b \n" - "uaddw v2.8h, v2.8h, v0.8b \n" - "subs w12, w12, #1 \n" - "b.gt 2b \n" - "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels - "add %1, %1, #16 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop + "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator + "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes + "uaddw2 v2.8h, v2.8h, v0.16b \n" // add + "uaddw v1.8h, v1.8h, v0.8b \n" + "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator + "subs %w2, %w2, #16 \n" // 16 processed per loop "b.gt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 : - : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + : "memory", "cc", "v0", "v1", "v2" // Clobber List ); } diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index d50a40c43..d97b4fc72 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1693,7 +1693,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) { EXPECT_EQ(0, ret); int half_width = (width + 1) / 2; - int half_height = (height + 1)/ 2; + int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); @@ -1727,7 +1727,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) { EXPECT_EQ(0, ret); int half_width = (width + 1) / 2; - int half_height = (height + 1)/ 2; + int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); @@ -1786,7 +1786,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) { EXPECT_EQ(0, ret); int half_width = (width + 1) / 2; - int half_height = (height + 1)/ 2; + int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); @@ -1816,7 +1816,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) { EXPECT_EQ(0, ret); int half_width = (width + 1) / 2; - int half_height = (height + 1)/ 2; + int half_height = (height + 1) / 2; int benchmark_iterations 
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index d50a40c43..d97b4fc72 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -1693,7 +1693,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) {
   EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
-  int half_height = (height + 1)/ 2;
+  int half_height = (height + 1) / 2;
   int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                              benchmark_height_ / (width * height);
 
@@ -1727,7 +1727,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
   EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
-  int half_height = (height + 1)/ 2;
+  int half_height = (height + 1) / 2;
   int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                              benchmark_height_ / (width * height);
 
@@ -1786,7 +1786,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
   EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
-  int half_height = (height + 1)/ 2;
+  int half_height = (height + 1) / 2;
   int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                              benchmark_height_ / (width * height);
 
@@ -1816,7 +1816,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
   EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
-  int half_height = (height + 1)/ 2;
+  int half_height = (height + 1) / 2;
   int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                              benchmark_height_ / (width * height);
 
@@ -1846,7 +1846,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
   EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
-  int half_height = (height + 1)/ 2;
+  int half_height = (height + 1) / 2;
   int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                              benchmark_height_ / (width * height);
 
@@ -1876,7 +1876,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
   EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
-  int half_height = (height + 1)/ 2;
+  int half_height = (height + 1) / 2;
   int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                              benchmark_height_ / (width * height);
 
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 6a0a58640..94aef60e2 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -303,10 +303,10 @@ TEST_FACTOR(3, 1, 3)
 
 TEST_SCALETO(ARGBScale, 1, 1)
 TEST_SCALETO(ARGBScale, 320, 240)
-TEST_SCALETO(ARGBScale, 352, 288)
 TEST_SCALETO(ARGBScale, 569, 480)
 TEST_SCALETO(ARGBScale, 640, 360)
 TEST_SCALETO(ARGBScale, 1280, 720)
+TEST_SCALETO(ARGBScale, 1920, 1080)
 #undef TEST_SCALETO1
 #undef TEST_SCALETO
 
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index b8994c304..d97d54a88 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -336,10 +336,10 @@ TEST_FACTOR(3, 1, 3, 0)
 
 TEST_SCALETO(Scale, 1, 1)
 TEST_SCALETO(Scale, 320, 240)
-TEST_SCALETO(Scale, 352, 288)
 TEST_SCALETO(Scale, 569, 480)
 TEST_SCALETO(Scale, 640, 360)
 TEST_SCALETO(Scale, 1280, 720)
+TEST_SCALETO(Scale, 1920, 1080)
 #undef TEST_SCALETO1
 #undef TEST_SCALETO
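The (width + 1) / 2 and (height + 1) / 2 expressions reformatted above are the rounded-up half dimensions used for the 4:2:0 chroma planes (I420/NV21), so odd frame sizes still get a full chroma sample at the edge. A tiny illustrative check (not part of the test suite):

#include <assert.h>

int main(void) {
  int width = 1281, height = 721;       // odd dimensions
  int half_width = (width + 1) / 2;     // 641 (plain width / 2 would give 640)
  int half_height = (height + 1) / 2;   // 361
  assert(half_width == 641 && half_height == 361);
  return 0;
}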