diff --git a/include/libyuv/row.h b/include/libyuv/row.h index bc7edafe4..890766ff3 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -284,6 +284,8 @@ extern "C" { (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_ABGRTOAR30ROW_AVX2 +#define HAS_ABGRTOUVROW_AVX2 +#define HAS_ABGRTOYROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 @@ -912,6 +914,8 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -940,7 +944,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_MSA(const uint8_t* src_argb0, +void ARGBToUVRow_MSA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -949,7 +953,7 @@ void ARGBToUV444Row_MMI(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_MMI(const uint8_t* src_argb0, +void ARGBToUVRow_MMI(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -999,32 +1003,32 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, +void ARGBToUVJRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void BGRAToUVRow_MSA(const uint8_t* src_rgb0, +void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ABGRToUVRow_MSA(const uint8_t* src_rgb0, +void ABGRToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGBAToUVRow_MSA(const uint8_t* src_rgb0, +void RGBAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, +void RGB24ToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RAWToUVRow_MSA(const uint8_t* src_rgb0, +void RAWToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1039,32 +1043,32 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVJRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void BGRAToUVRow_MMI(const uint8_t* src_rgb0, +void BGRAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ABGRToUVRow_MMI(const uint8_t* src_rgb0, +void ABGRToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGBAToUVRow_MMI(const uint8_t* src_rgb0, +void RGBAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, +void RGB24ToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RAWToUVRow_MMI(const uint8_t* src_rgb0, +void RAWToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1096,29 +1100,29 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width); -void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width); -void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); @@ -1169,37 +1173,42 @@ void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +void ABGRToUVRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width); -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, @@ -1209,6 +1218,11 @@ void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, @@ -1396,47 +1410,47 @@ void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_C(const uint8_t* src_rgb0, +void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb0, +void ARGBToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_C(const uint8_t* src_rgb0, +void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb0, +void ARGBToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void BGRAToUVRow_C(const uint8_t* src_rgb0, +void BGRAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ABGRToUVRow_C(const uint8_t* src_rgb0, +void ABGRToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGBAToUVRow_C(const uint8_t* src_rgb0, +void RGBAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGB24ToUVRow_C(const uint8_t* src_rgb0, +void RGB24ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RAWToUVRow_C(const uint8_t* src_rgb0, +void RAWToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, diff --git a/source/row_any.cc b/source/row_any.cc index da91347b5..9fafff602 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -580,6 +580,9 @@ ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) #endif +#ifdef HAS_ABGRTOYROW_AVX2 +ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBTOYJROW_AVX2 ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) #endif @@ -1273,6 +1276,9 @@ ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVROW_AVX2 +ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) +#endif #ifdef HAS_ARGBTOUVJROW_AVX2 ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 003c9082c..ae672d625 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1154,6 +1154,48 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYROW_AVX2 +#ifdef HAS_ABGRTOYROW_AVX2 +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ABGRTOYROW_AVX2 + + #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1328,6 +1370,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, } #endif // HAS_ARGBTOUVROW_AVX2 +#ifdef HAS_ABGRTOUVROW_AVX2 +void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kAddUV128), // %5 + "m"(kABGRToV), // %6 + "m"(kABGRToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOUVROW_AVX2 + #ifdef HAS_ARGBTOUVJROW_AVX2 void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, int src_stride_argb,