diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 4e02a212d..5de839df1 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -258,6 +258,7 @@ extern "C" { // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_I422ALPHATOARGBROW_NEON #define HAS_ABGRTOUVROW_NEON #define HAS_ABGRTOYROW_NEON #define HAS_ARGB1555TOARGBROW_NEON @@ -553,6 +554,13 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1533,6 +1541,13 @@ void I422ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I411ToARGBRow_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/source/row_any.cc b/source/row_any.cc index 56147e13f..fe7f4813f 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -48,7 +48,10 @@ extern "C" { ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) #endif #ifdef HAS_I422ALPHATOARGBROW_AVX2 -ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 7) +ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422ALPHATOARGBROW_NEON +ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) #endif #undef ANY41C diff --git a/source/row_neon.cc b/source/row_neon.cc index ae8c15062..256785ddd 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -142,11 +142,11 @@ void I444ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV444 YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -172,11 +172,11 @@ void I422ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -194,6 +194,39 @@ void I422ToARGBRow_NEON(const uint8* src_y, ); } +void I422AlphaToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB + "subs %5, %5, #8 \n" + MEMACCESS(3) + "vld1.8 {d23}, [%3]! \n" + MEMACCESS(4) + "vst4.8 {d20, d21, d22, d23}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + void I411ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -202,11 +235,11 @@ void I411ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV411 YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -232,11 +265,11 @@ void I422ToRGBARow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d19, #255 \n" "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" @@ -417,11 +450,11 @@ void I400ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUV400 YUVTORGB "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" @@ -466,11 +499,11 @@ void NV12ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READNV12 YUVTORGB "subs %3, %3, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" @@ -494,11 +527,11 @@ void NV21ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READNV21 YUVTORGB "subs %3, %3, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" @@ -549,11 +582,11 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READYUY2 YUVTORGB "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" @@ -575,11 +608,11 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, int width) { asm volatile ( YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" "1: \n" READUYVY YUVTORGB "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 194a566bb..a9801f2e4 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -127,15 +127,6 @@ extern "C" { "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ -// TODO(fbarchard): Use structure for constants like 32 bit code. -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ - #ifdef HAS_I444TOARGBROW_NEON void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -145,11 +136,11 @@ void I444ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ "1: \n" READYUV444 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -168,7 +159,6 @@ void I444ToARGBRow_NEON(const uint8* src_y, } #endif // HAS_I444TOARGBROW_NEON -// TODO(fbarchard): Switch to Matrix version of this function. #ifdef HAS_I422TOARGBROW_NEON void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -178,11 +168,11 @@ void I422ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ "1: \n" READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -201,6 +191,41 @@ void I422ToARGBRow_NEON(const uint8* src_y, } #endif // HAS_I422TOARGBROW_NEON +#ifdef HAS_I422ALPHATOARGBROW_NEON +void I422AlphaToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + MEMACCESS(3) + "ld1 {v23.8b}, [%3], #8 \n" + "subs %w5, %w5, #8 \n" + MEMACCESS(4) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422ALPHATOARGBROW_NEON + #ifdef HAS_I411TOARGBROW_NEON void I411ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -210,11 +235,11 @@ void I411ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" /* A */ "1: \n" READYUV411 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -242,11 +267,11 @@ void I422ToRGBARow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v20.8b, #255 \n" /* A */ "1: \n" READYUV422 YUVTORGB(v23, v22, v21) "subs %w4, %w4, #8 \n" - "movi v20.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -353,11 +378,11 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" ARGBTOARGB1555 MEMACCESS(3) "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. @@ -428,11 +453,11 @@ void I400ToARGBRow_NEON(const uint8* src_y, int64 width64 = (int64)(width); asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READYUV400 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" @@ -481,11 +506,11 @@ void NV12ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" @@ -511,11 +536,11 @@ void NV21ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READNV21 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" @@ -571,11 +596,11 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, int64 width64 = (int64)(width); asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READYUY2 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" @@ -600,11 +625,11 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, int64 width64 = (int64)(width); asm volatile ( YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" READUYVY YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "b.gt 1b \n" @@ -1444,6 +1469,14 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, } #endif // HAS_ARGBTOUV444ROW_NEON +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + // 16x1 pixels -> 8x1. width is number of argb pixels. e.g. 16. #ifdef HAS_ARGBTOUV422ROW_NEON void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,