From 82069e7a7108a956d0d78f7e923851925b2827d4 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 5 Oct 2012 18:27:23 +0000 Subject: [PATCH] pld and palign for performance consistency. BUG=113 TEST=unittest run on linux neon Review URL: https://webrtc-codereview.appspot.com/860008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@394 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_neon.cc | 96 +++++++++++++++++++++++++++------------- 3 files changed, 68 insertions(+), 32 deletions(-) diff --git a/README.chromium b/README.chromium index c0c0b5560..84fc041c5 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 393 +Version: 394 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index f5b72527f..6a028df60 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 393 +#define LIBYUV_VERSION 394 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_neon.cc b/source/row_neon.cc index 14455fd19..ced88c43c 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -68,6 +68,7 @@ void I422ToARGBRow_NEON(const uint8* y_buf, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" + ".p2align 2 \n" "1: \n" YUV422TORGB "vmov.u8 d21, d16 \n" @@ -100,6 +101,7 @@ void I422ToBGRARow_NEON(const uint8* y_buf, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" + ".p2align 2 \n" "1: \n" YUV422TORGB "vswp.u8 d20, d22 \n" @@ -133,6 +135,7 @@ void I422ToABGRRow_NEON(const uint8* y_buf, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" + ".p2align 2 \n" "1: \n" YUV422TORGB "vswp.u8 d20, d22 \n" @@ -166,6 +169,7 @@ void I422ToRGBARow_NEON(const uint8* y_buf, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" + ".p2align 2 \n" "1: \n" YUV422TORGB "vmov.u8 d21, d16 \n" @@ -191,11 +195,12 @@ void I422ToRGBARow_NEON(const uint8* y_buf, // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels. void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( + ".p2align 2 \n" "1: \n" "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop "vst1.u8 {q0}, [%1]! \n" // store U "vst1.u8 {q1}, [%2]! \n" // Store V + "subs %3, %3, #16 \n" // 16 processed per loop "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 @@ -211,11 +216,12 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { // Copy multiple of 64 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( + ".p2align 2 \n" "1: \n" - "pld [%0, #0xC0] \n" // preload + "pld [%0, #192] \n" // preload "vldm %0!,{q0, q1, q2, q3} \n" // load 64 - "subs %2, %2, #64 \n" // 64 processed per loop "vstm %1!,{q0, q1, q2, q3} \n" // store 64 + "subs %2, %2, #64 \n" // 64 processed per loop "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -255,12 +261,13 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { // the bytes in the entire 128 bits in one go. // because of the inability to mirror the entire 128 bits // mirror the writing out of the two 64 bit segments. - "1: \n" - "vld1.8 {q0}, [%0]! \n" // src += 16 - "vrev64.8 q0, q0 \n" - "vst1.8 {d1}, [%1]! \n" - "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 - "subs %2, #16 \n" + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // src += 16 + "vrev64.8 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" + "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 + "subs %2, #16 \n" "bge 1b \n" // add 16 back to the counter. if the result is 0 there is no @@ -325,13 +332,14 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { "sub %3, #8 \n" // mirror the bytes in the 64 bit segments - "1: \n" - "vld2.8 {d0, d1}, [%0]! \n" // src += 16 - "vrev64.8 q0, q0 \n" - "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8 - "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 - "subs %3, #8 \n" - "bge 1b \n" + ".p2align 2 \n" + "1: \n" + "vld2.8 {d0, d1}, [%0]! \n" // src += 16 + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8 + "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 + "subs %3, #8 \n" + "bge 1b \n" // add 8 back to the counter. if the result is 0 there is no // residuals so return @@ -363,12 +371,14 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { #ifdef HAS_BGRATOARGBROW_NEON void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d2 \n" // swap G, R "vswp.u8 d0, d3 \n" // swap B, A "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_argb), // %1 @@ -382,11 +392,13 @@ void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) { #ifdef HAS_ABGRTOARGBROW_NEON void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d0, d2 \n" // swap R, B "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. "bgt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_argb), // %1 @@ -400,11 +412,13 @@ void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) { #ifdef HAS_RGBATOARGBROW_NEON void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. "vmov.u8 d4, d0 \n" // move A after RGB "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. "bgt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_argb), // %1 @@ -419,10 +433,12 @@ void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 @@ -437,11 +453,13 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 @@ -455,11 +473,13 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { #ifdef HAS_ARGBTORGBAROW_NEON void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. "vmov.u8 d0, d4 \n" // move A before RGB. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgba), // %1 @@ -473,10 +493,12 @@ void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) { #ifdef HAS_ARGBTORGB24ROW_NEON void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 @@ -490,11 +512,13 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { #ifdef HAS_ARGBTORAWROW_NEON void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "vswp.u8 d1, d3 \n" // swap R, B - "subs %2, %2, #8 \n" // 8 processed per loop. "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 @@ -508,10 +532,12 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { #ifdef HAS_YUY2TOYROW_NEON void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. "vst1.u8 {q0}, [%1]! \n" // store 16 pixels of Y. + "subs %2, %2, #16 \n" // 16 processed per loop. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 @@ -525,10 +551,12 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { #ifdef HAS_UYVYTOYROW_NEON void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. "vst1.u8 {q1}, [%1]! \n" // store 16 pixels of Y. + "subs %2, %2, #16 \n" // 16 processed per loop. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -546,11 +574,13 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "vst1.u8 {d1}, [%1]! \n" // store 8 U. "vst1.u8 {d3}, [%2]! \n" // store 8 V. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 @@ -566,11 +596,13 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "vst1.u8 {d0}, [%1]! \n" // store 8 U. "vst1.u8 {d2}, [%2]! \n" // store 8 V. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 @@ -587,14 +619,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "adds %1, %0, %1 \n" // stride + src_yuy2 + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "vrhadd.u8 d1, d1, d5 \n" // average rows of U "vrhadd.u8 d3, d3, d7 \n" // average rows of V "vst1.u8 {d1}, [%2]! \n" // store 8 U. "vst1.u8 {d3}, [%3]! \n" // store 8 V. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(stride_yuy2), // %1 @@ -612,14 +646,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "adds %1, %0, %1 \n" // stride + src_uyvy + ".p2align 2 \n" "1: \n" + "pld [%0, #192] \n" // preload "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "vrhadd.u8 d0, d0, d4 \n" // average rows of U "vrhadd.u8 d2, d2, d6 \n" // average rows of V "vst1.u8 {d0}, [%2]! \n" // store 8 U. "vst1.u8 {d2}, [%3]! \n" // store 8 V. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(stride_uyvy), // %1