From 860cc0357a4b0d224be673c0ccfad192745a4192 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 3 Nov 2015 19:21:36 -0800 Subject: [PATCH] Neon versions of I420AlphaToARGB Add alpha version of YUV to RGB to neon code for ARMv7 and aarch64. For other YUV to RGB conversions, hoist alpha set to 255 out of loop. TBR=harryjin@google.com BUG=libyuv:516 Review URL: https://codereview.chromium.org/1413763017 . --- include/libyuv/row.h | 15 +++++++++ source/row_any.cc | 5 ++- source/row_neon.cc | 51 +++++++++++++++++++++++++------ source/row_neon64.cc | 73 ++++++++++++++++++++++++++++++++------------ 4 files changed, 114 insertions(+), 30 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 4e02a212d..5de839df1 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -258,6 +258,7 @@ extern "C" { // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_I422ALPHATOARGBROW_NEON #define HAS_ABGRTOUVROW_NEON #define HAS_ABGRTOYROW_NEON #define HAS_ARGB1555TOARGBROW_NEON @@ -553,6 +554,13 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1533,6 +1541,13 @@ void I422ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I411ToARGBRow_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/source/row_any.cc b/source/row_any.cc index 56147e13f..fe7f4813f 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -48,7 +48,10 @@ extern "C" { ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) #endif #ifdef HAS_I422ALPHATOARGBROW_AVX2 -ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 7) +ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422ALPHATOARGBROW_NEON +ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) #endif #undef ANY41C diff --git a/source/row_neon.cc b/source/row_neon.cc index ae8c15062..256785ddd 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -142,11 +142,11 @@ void I444ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV444 YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -172,11 +172,11 @@ void I422ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -194,6 +194,39 @@ void I422ToARGBRow_NEON(const uint8* src_y, ); } +void I422AlphaToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB + "subs %5, %5, #8 \n" + MEMACCESS(3) + "vld1.8 {d23}, [%3]! \n" + MEMACCESS(4) + "vst4.8 {d20, d21, d22, d23}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + void I411ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -202,11 +235,11 @@ void I411ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV411 YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -232,11 +265,11 @@ void I422ToRGBARow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d19, #255 \n" "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" @@ -417,11 +450,11 @@ void I400ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV400 YUVTORGB "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" @@ -466,11 +499,11 @@ void NV12ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READNV12 YUVTORGB "subs %3, %3, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" @@ -494,11 +527,11 @@ void NV21ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READNV21 YUVTORGB "subs %3, %3, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" @@ -549,11 +582,11 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUY2 YUVTORGB "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" @@ -575,11 +608,11 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READUYVY YUVTORGB "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 194a566bb..a9801f2e4 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -127,15 +127,6 @@ extern "C" { "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ -// TODO(fbarchard): Use structure for constants like 32 bit code. -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ - #ifdef HAS_I444TOARGBROW_NEON void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -145,11 +136,11 @@ void I444ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ "1: \n" READYUV444 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -168,7 +159,6 @@ void I444ToARGBRow_NEON(const uint8* src_y, } #endif // HAS_I444TOARGBROW_NEON -// TODO(fbarchard): Switch to Matrix version of this function. #ifdef HAS_I422TOARGBROW_NEON void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -178,11 +168,11 @@ void I422ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ "1: \n" READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -201,6 +191,41 @@ void I422ToARGBRow_NEON(const uint8* src_y, } #endif // HAS_I422TOARGBROW_NEON +#ifdef HAS_I422ALPHATOARGBROW_NEON +void I422AlphaToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + MEMACCESS(3) + "ld1 {v23.8b}, [%3], #8 \n" + "subs %w5, %w5, #8 \n" + MEMACCESS(4) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422ALPHATOARGBROW_NEON + #ifdef HAS_I411TOARGBROW_NEON void I411ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -210,11 +235,11 @@ void I411ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ "1: \n" READYUV411 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -242,11 +267,11 @@ void I422ToRGBARow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v20.8b, #255 \n" /* A */ "1: \n" READYUV422 YUVTORGB(v23, v22, v21) "subs %w4, %w4, #8 \n" - "movi v20.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -353,11 +378,11 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" ARGBTOARGB1555 MEMACCESS(3) "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. @@ -428,11 +453,11 @@ void I400ToARGBRow_NEON(const uint8* src_y, int64 width64 = (int64)(width); asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READYUV400 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" @@ -481,11 +506,11 @@ void NV12ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" @@ -511,11 +536,11 @@ void NV21ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READNV21 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" @@ -571,11 +596,11 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, int64 width64 = (int64)(width); asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READYUY2 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" @@ -600,11 +625,11 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, int64 width64 = (int64)(width); asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READUYVY YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "b.gt 1b \n" @@ -1444,6 +1469,14 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, } #endif // HAS_ARGBTOUV444ROW_NEON +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + // 16x1 pixels -> 8x1. width is number of argb pixels. e.g. 16. #ifdef HAS_ARGBTOUV422ROW_NEON void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,