From e91bdaca3674830570cbb2aaab6d5c939f56dee4 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com" <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Tue, 9 Oct 2012 21:09:33 +0000
Subject: [PATCH] Move HalfRow to row_win and port to row_neon

BUG=118
TEST=libyuvTest.I420ToI422_OptVsC (247 ms)
Review URL: https://webrtc-codereview.appspot.com/855012

git-svn-id: http://libyuv.googlecode.com/svn/trunk@400 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 include/libyuv/row.h      |  12 ++++-
 source/convert.cc         |  79 +++++------------------------
 source/row_common.cc      |   7 +++
 source/row_neon.cc        |  21 ++++++++
 source/row_posix.cc       |  97 ++++++++++++++++++++++++++++++++++++
 source/row_win.cc         |  24 +++++++++
 unit_test/compare_test.cc |  28 ++++++++---
 unit_test/planar_test.cc  | 101 ++++++++++++++++++++++++++++++++++++++
 8 files changed, 292 insertions(+), 77 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ebbc4572c..4c16269bb 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -53,11 +53,13 @@ extern "C" {
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_SSE2
 #define HAS_COPYROW_X86
+#define HAS_HALFROW_SSE2
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I411TOARGBROW_SSSE3
 #define HAS_I422TOABGRROW_SSSE3
 #define HAS_I422TOARGBROW_SSSE3
 #define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
 #define HAS_I444TOARGBROW_SSSE3
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORROWUV_SSSE3
@@ -96,7 +98,6 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_ARGBCOLORTABLEROW_X86
-#define HAS_I422TORGBAROW_SSSE3
 #define HAS_RGBATOARGBROW_SSSE3
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
@@ -116,6 +117,7 @@ extern "C" {
 // The following are available on Neon platforms
 #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_COPYROW_NEON
+#define HAS_HALFROW_NEON
 #define HAS_I422TOABGRROW_NEON
 #define HAS_I422TOARGBROW_NEON
 #define HAS_I422TOBGRAROW_NEON
@@ -750,6 +752,14 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction);
 
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+               uint8* dst_uv, int pix);
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix);
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix);
+
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/convert.cc b/source/convert.cc
index 0882c92ba..0f21b03fa 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -62,66 +62,6 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Move to row_win etc.
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_HALFROW_SSE2
-__declspec(naked) __declspec(align(16))
-static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                         uint8* dst_uv, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // src_uv_stride
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    sub        edi, eax
-
-    align      16
-  convertloop:
-    movdqa     xmm0, [eax]
-    pavgb      xmm0, [eax + edx]
-    sub        ecx, 16
-    movdqa     [eax + edi], xmm0
-    lea        eax, [eax + 16]
-    jg         convertloop
-    pop        edi
-    ret
-  }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_HALFROW_SSE2
-static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                         uint8* dst_uv, int pix) {
-  asm volatile (
-    "sub        %0,%1                          \n"
-    ".p2align  4                               \n"
-"1:                                            \n"
-    "movdqa     (%0),%%xmm0                    \n"
-    "pavgb      (%0,%3),%%xmm0                 \n"
-    "sub        $0x10,%2                       \n"
-    "movdqa     %%xmm0,(%0,%1)                 \n"
-    "lea        0x10(%0),%0                    \n"
-    "jg         1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_uv),  // %1
-    "+r"(pix)      // %2
-  : "r"(static_cast<intptr_t>(src_uv_stride))  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0"
-#endif
-);
-}
-#endif
-
-static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
-                      uint8* dst_uv, int pix) {
-  for (int x = 0; x < pix; ++x) {
-    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
-  }
-}
-
 LIBYUV_API
 int I422ToI420(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -149,14 +89,17 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
   void (*HalfRow)(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) = HalfRow_C;
 #if defined(HAS_HALFROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(halfwidth, 16) &&
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
       IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
       IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
       IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
       IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
     HalfRow = HalfRow_SSE2;
   }
+#elif defined(HAS_HALFROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
+    HalfRow = HalfRow_NEON;
+  }
 #endif
 
   // Copy Y plane
@@ -296,12 +239,12 @@ int I411ToI420(const uint8* src_y, int src_stride_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (height - 1) * dst_stride_u;
-    dst_v = dst_v + (height - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
   }
 
   // Copy Y plane
diff --git a/source/row_common.cc b/source/row_common.cc
index e0e426cd8..83c0d697b 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1240,6 +1240,13 @@ void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
   } while (dst_ptr < end);
 }
 
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+               uint8* dst_uv, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 19a783305..200538de5 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -821,6 +821,27 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
 }
 #endif  // HAS_UYVYTOYROW_NEON
 
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %0                         \n"
+  "1:                                          \n"
+    "vld1.u8    {q0}, [%0]!                    \n"  // load row 1 16 pixels.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vld1.u8    {q1}, [%1]!                    \n"  // load row 2 16 pixels.
+    "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
+    "vst1.u8    {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_uv),         // %0
+      "+r"(src_uv_stride),  // %1
+      "+r"(dst_uv),         // %2
+      "+r"(pix)             // %3
+    :
+    : "memory", "cc", "q0", "q1"  // Clobber List
+  );
+}
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 33149dada..74783d370 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -1816,6 +1816,43 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
   );
 }
 
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgba_buf,
+                                int width) {
+  asm volatile (
+    "sub        %[u_buf],%[v_buf]              \n"
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "pxor       %%xmm4,%%xmm4                  \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "punpcklbw  %%xmm2,%%xmm1                  \n"
+    "punpcklbw  %%xmm0,%%xmm5                  \n"
+    "movdqa     %%xmm5,%%xmm0                  \n"
+    "punpcklwd  %%xmm1,%%xmm5                  \n"
+    "punpckhwd  %%xmm1,%%xmm0                  \n"
+    "movdqa     %%xmm5,(%[argb_buf])           \n"
+    "movdqa     %%xmm0,0x10(%[argb_buf])       \n"
+    "lea        0x20(%[argb_buf]),%[argb_buf]  \n"
+    "sub        $0x8,%[width]                  \n"
+    "jg         1b                             \n"
+  : [y_buf]"+r"(y_buf),        // %[y_buf]
+    [u_buf]"+r"(u_buf),        // %[u_buf]
+    [v_buf]"+r"(v_buf),        // %[v_buf]
+    [argb_buf]"+r"(rgba_buf),  // %[argb_buf]
+    [width]"+rm"(width)        // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
 void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                           const uint8* u_buf,
                                           const uint8* v_buf,
@@ -1888,6 +1925,44 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
 #endif
   );
 }
+
+void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgba_buf,
+                                          int width) {
+  asm volatile (
+    "sub        %[u_buf],%[v_buf]              \n"
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "pxor       %%xmm4,%%xmm4                  \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "punpcklbw  %%xmm2,%%xmm1                  \n"
+    "punpcklbw  %%xmm0,%%xmm5                  \n"
+    "movdqa     %%xmm5,%%xmm0                  \n"
+    "punpcklwd  %%xmm1,%%xmm5                  \n"
+    "punpckhwd  %%xmm1,%%xmm0                  \n"
+    "movdqa     %%xmm5,(%[argb_buf])           \n"
+    "movdqa     %%xmm0,0x10(%[argb_buf])       \n"
+    "lea        0x20(%[argb_buf]),%[argb_buf]  \n"
+    "sub        $0x8,%[width]                  \n"
+    "jg         1b                             \n"
+  : [y_buf]"+r"(y_buf),        // %[y_buf]
+    [u_buf]"+r"(u_buf),        // %[u_buf]
+    [v_buf]"+r"(v_buf),        // %[v_buf]
+    [argb_buf]"+r"(rgba_buf),  // %[argb_buf]
+    [width]"+rm"(width)        // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
 #endif  // HAS_I422TOARGBROW_SSSE3
 
 #ifdef HAS_YTOARGBROW_SSE2
@@ -3654,6 +3729,28 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   );
 }
 
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  asm volatile (
+    "sub        %0,%1                          \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqa     (%0),%%xmm0                    \n"
+    "pavgb      (%0,%3),%%xmm0                 \n"
+    "sub        $0x10,%2                       \n"
+    "movdqa     %%xmm0,(%0,%1)                 \n"
+    "lea        0x10(%0),%0                    \n"
+    "jg         1b                             \n"
+  : "+r"(src_uv),  // %0
+    "+r"(dst_uv),  // %1
+    "+r"(pix)      // %2
+  : "r"(static_cast<intptr_t>(src_uv_stride))  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0"
+#endif
+  );
+}
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/source/row_win.cc b/source/row_win.cc
index de70b9435..8a29f24bb 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4193,6 +4193,30 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   }
 }
 
+__declspec(naked) __declspec(align(16))
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // src_uv_stride
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    sub        edi, eax
+
+    align      16
+  convertloop:
+    movdqa     xmm0, [eax]
+    pavgb      xmm0, [eax + edx]
+    sub        ecx, 16
+    movdqa     [eax + edi], xmm0
+    lea        eax, [eax + 16]
+    jg         convertloop
+    pop        edi
+    ret
+  }
+}
+
 #endif  // _M_IX86
 
 #ifdef __cplusplus
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 8a49a612f..f6086f03d 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -108,19 +108,25 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_C) {
   align_buffer_16(src_a, kMaxWidth)
   align_buffer_16(src_b, kMaxWidth)
 
+  MaskCpuFlags(kCpuInitialized);
+
+  memcpy(src_a, "test0123test4567", 16);
+  memcpy(src_b, "tick0123tock4567", 16);
+  uint64 h1 = ComputeSumSquareError(src_a, src_b, 16);
+  EXPECT_EQ(790u, h1);
+
   for (int i = 0; i < kMaxWidth; ++i) {
     src_a[i] = i;
     src_b[i] = i;
   }
 
-  MaskCpuFlags(kCpuInitialized);
-  for (int i = 0; i < benchmark_iterations_; ++i) {
-    ComputeSumSquareError(src_a, src_b, kMaxWidth);
+  int count = benchmark_iterations_ * 1280 * 720 / kMaxWidth;
+  for (int i = 0; i < count; ++i) {
+    h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
   }
 
   MaskCpuFlags(-1);
-
-  EXPECT_EQ(0, 0);
+  EXPECT_EQ(h1, 0);
 
   free_aligned_buffer_16(src_a)
   free_aligned_buffer_16(src_b)
@@ -131,16 +137,22 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) {
   align_buffer_16(src_a, kMaxWidth)
   align_buffer_16(src_b, kMaxWidth)
 
+  memcpy(src_a, "test0123test4567", 16);
+  memcpy(src_b, "tick0123tock4567", 16);
+  uint64 h1 = ComputeSumSquareError(src_a, src_b, 16);
+  EXPECT_EQ(790u, h1);
+
   for (int i = 0; i < kMaxWidth; ++i) {
     src_a[i] = i;
     src_b[i] = i;
   }
 
-  for (int i = 0; i < benchmark_iterations_; ++i) {
-    ComputeSumSquareError(src_a, src_b, kMaxWidth);
+  int count = benchmark_iterations_ * 1280 * 720 / kMaxWidth;
+  for (int i = 0; i < count; ++i) {
+    h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
   }
 
-  EXPECT_EQ(0, 0);
+  EXPECT_EQ(h1, 0);
 
   free_aligned_buffer_16(src_a)
   free_aligned_buffer_16(src_b)
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index e9053a359..8af0bf6c2 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -28,6 +28,107 @@
 
 namespace libyuv {
 
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, N, NEG) \
+TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N##_OptVsC) { \
+  const int kWidth = 1280; \
+  const int kHeight = 720; \
+  align_buffer_16(src_y, kWidth * kHeight); \
+  align_buffer_16(src_u, kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y); \
+  align_buffer_16(src_v, kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y); \
+  align_buffer_16(dst_y_c, kWidth * kHeight); \
+  align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  align_buffer_16(dst_y_opt, kWidth * kHeight); \
+  align_buffer_16(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  align_buffer_16(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  srandom(time(NULL)); \
+  for (int i = 0; i < kHeight; ++i) \
+    for (int j = 0; j < kWidth; ++j) \
+      src_y[(i * kWidth) + j] = (random() & 0xff); \
+  for (int i = 0; i < kHeight / SRC_SUBSAMP_Y; ++i) \
+    for (int j = 0; j < kWidth / SRC_SUBSAMP_X; ++j) { \
+      src_u[(i * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff); \
+      src_v[(i * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff); \
+    } \
+  MaskCpuFlags(kCpuInitialized); \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth, \
+                                 src_u, kWidth / SRC_SUBSAMP_X, \
+                                 src_v, kWidth / SRC_SUBSAMP_X, \
+                                 dst_y_c, kWidth, \
+                                 dst_u_c, kWidth / SUBSAMP_X, \
+                                 dst_v_c, kWidth / SUBSAMP_X, \
+                                 kWidth, NEG kHeight); \
+  MaskCpuFlags(-1); \
+  for (int i = 0; i < benchmark_iterations_; ++i) { \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth, \
+                                   src_u, kWidth / SRC_SUBSAMP_X, \
+                                   src_v, kWidth / SRC_SUBSAMP_X, \
+                                   dst_y_opt, kWidth, \
+                                   dst_u_opt, kWidth / SUBSAMP_X, \
+                                   dst_v_opt, kWidth / SUBSAMP_X, \
+                                   kWidth, NEG kHeight); \
+  } \
+  int max_diff = 0; \
+  for (int i = 0; i < kHeight; ++i) { \
+    for (int j = 0; j < kWidth; ++j) { \
+      int abs_diff = \
+          abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+              static_cast<int>(dst_y_opt[i * kWidth + j])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+  } \
+  EXPECT_LE(max_diff, 2); \
+  for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \
+    for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+      int abs_diff = \
+          abs(static_cast<int>(dst_u_c[i * kWidth / SUBSAMP_X + j]) - \
+              static_cast<int>(dst_u_opt[i * kWidth / SUBSAMP_X + j])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+  } \
+  EXPECT_LE(max_diff, 2); \
+  for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \
+    for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+      int abs_diff = \
+          abs(static_cast<int>(dst_v_c[i * kWidth / SUBSAMP_X + j]) - \
+              static_cast<int>(dst_v_opt[i * kWidth / SUBSAMP_X + j])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+  } \
+  EXPECT_LE(max_diff, 2); \
+  free_aligned_buffer_16(dst_y_c) \
+  free_aligned_buffer_16(dst_u_c) \
+  free_aligned_buffer_16(dst_v_c) \
+  free_aligned_buffer_16(dst_y_opt) \
+  free_aligned_buffer_16(dst_u_opt) \
+  free_aligned_buffer_16(dst_v_opt) \
+  free_aligned_buffer_16(src_y) \
+  free_aligned_buffer_16(src_u) \
+  free_aligned_buffer_16(src_v) \
+}
+
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                      FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +) \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -) \
+
+TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
+TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
+TESTPLANARTOP(I411, 4, 1, I420, 2, 2)
+TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
+TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
+TESTPLANARTOP(I420, 2, 2, I411, 4, 1)
+
 #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, N, NEG) \
 TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
   const int kWidth = 1280; \
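
Note on the primitive this patch moves: HalfRow computes a rounding vertical average, dst[x] = (row0[x] + row1[x] + 1) >> 1. SSE2 pavgb and NEON vrhadd.u8 both implement exactly this per-byte rounding average, which is why the C, SSE2, and NEON paths can be expected to agree bit-for-bit. Below is a minimal standalone sketch of the same arithmetic (plain C++; the HalfRowRef name and the main() harness are illustrative only, not part of the patch or of libyuv):

// halfrow_ref.cc: standalone check of the HalfRow rounding average.
// Build (illustrative): g++ halfrow_ref.cc -o halfrow_ref
#include <cassert>
#include <cstdint>
#include <cstdio>

// Same arithmetic as HalfRow_C above: each output byte is the rounded
// average of vertically adjacent bytes from two input rows.
static void HalfRowRef(const uint8_t* src_uv, int src_uv_stride,
                       uint8_t* dst_uv, int pix) {
  for (int x = 0; x < pix; ++x) {
    // uint8_t operands promote to int, so the +1 rounding bias cannot
    // overflow before the shift.
    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
  }
}

int main() {
  // Two 8-byte rows stored back to back; stride == 8 selects row 2.
  const uint8_t rows[16] = {0, 1, 2, 3, 254, 255, 10, 20,
                            0, 2, 3, 3, 255, 255, 11, 21};
  uint8_t out[8];
  HalfRowRef(rows, 8, out, 8);
  assert(out[0] == 0);    // (0 + 0 + 1) >> 1
  assert(out[1] == 2);    // (1 + 2 + 1) >> 1 rounds 1.5 up
  assert(out[4] == 255);  // (254 + 255 + 1) >> 1 stays in range
  for (int x = 0; x < 8; ++x) {
    printf("%u ", out[x]);
  }
  printf("\n");
  return 0;
}

Because I422 carries full-height chroma planes while I420 carries half-height ones, the I422ToI420 path in this patch reduces to a Y-plane copy plus one HalfRow call per pair of source U and V rows, with the CPU-flag dispatch above picking the SSE2, NEON, or C implementation of that single primitive.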