diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h
index 50722d761..4d613502a 100644
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@@ -55,6 +55,18 @@ int ARGBToRGBA(const uint8* src_argb,
                int width,
                int height);
 
+// Convert ARGB To AR30.
+// AR30 packs each pixel into 32 bits: 2 bit alpha, 10 bit red, green and blue.
+// A negative height reads the source image bottom up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToAR30(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
 // Convert ARGB To RGB24.
 LIBYUV_API
 int ARGBToRGB24(const uint8* src_argb,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 34d727641..30b6e4c6a 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -277,6 +277,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
     (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ARGBTOAR30ROW_AVX2
 #define HAS_MERGEUVROW_16_AVX2
 #define HAS_MULTIPLYROW_16_AVX2
 #endif
@@ -1791,6 +1792,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToAR30Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
 
 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
@@ -1817,6 +1819,7 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToAR30Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
 
 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
@@ -2416,6 +2419,9 @@ void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb,
 void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb,
                                 uint8* dst_rgb,
                                 int width);
+void ARGBToAR30Row_Any_AVX2(const uint8* src_argb,
+                            uint8* dst_rgb,
+                            int width);
 
 void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 88f38279a..77c542b7a 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -1308,6 +1308,50 @@ int ARGBToARGB4444(const uint8* src_argb,
   return 0;
 }
 
+// Convert ARGB To AR30.
+// Returns -1 for a NULL buffer, width <= 0 or height == 0; otherwise 0.
+// A negative height walks the source image bottom up.
+LIBYUV_API
+int ARGBToAR30(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToAR30Row_C;
+  if (!src_argb || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows: contiguous images are converted as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_ar30 = 0;
+  }
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Widths that are not a multiple of 8 go through the Any wrapper.
+    ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToAR30Row = ARGBToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBToAR30Row(src_argb, dst_ar30, width);
+    src_argb += src_stride_argb;
+    dst_ar30 += dst_stride_ar30;
+  }
+  return 0;
+}
+
 // Convert ARGB to J420. (JPeg full range I420).
 LIBYUV_API
 int ARGBToJ420(const uint8* src_argb,
diff --git a/source/row_any.cc b/source/row_any.cc
index 8b31ac9fc..4f1877656 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -396,6 +396,9 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
 ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
 ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
 #endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
 #if defined(HAS_J400TOARGBROW_SSE2)
 ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
 #endif
diff --git a/source/row_common.cc b/source/row_common.cc
index 6ffc4febb..5dfd57aed 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -301,6 +301,22 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
+// Expand each 8 bit colour channel to 10 bits by copying its top 2 bits into
+// the low bits, keep the top 2 bits of alpha, and store one 32 bit word per
+// pixel: blue in bits 0-9, green in 10-19, red in 20-29, alpha in 30-31.
+void ARGBToAR30Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32 b0 = (src_argb[0] >> 6) | (src_argb[0] << 2);
+    uint32 g0 = (src_argb[1] >> 6) | (src_argb[1] << 2);
+    uint32 r0 = (src_argb[2] >> 6) | (src_argb[2] << 2);
+    uint32 a0 = (src_argb[3] >> 6);
+    *(uint32*)(dst_rgb) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
+    dst_rgb += 4;
+    src_argb += 4;
+  }
+}
+
 static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
   return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
 }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index b5c2e65c9..f348b7edc 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -700,6 +700,59 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+// Convert 8 ARGB pixels (32 bytes) to 8 AR30 pixels (32 bytes) per loop.
+// Whole groups of 8 are always processed, so width must be a multiple of 8.
+void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"  // 0x000000ff mask
+    "vpsrld $0x18,%%ymm4,%%ymm4 \n"
+    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // 0xc0000000 mask
+    "vpslld $30,%%ymm5,%%ymm5 \n"
+
+    LABELALIGN
+    "1: \n"
+    "vmovdqu (%0),%%ymm0 \n"
+    // alpha: keep the top 2 bits in place (bits 30-31)
+    "vpand %%ymm5,%%ymm0,%%ymm3 \n"
+    // red: (r << 2 | r >> 6) placed at bits 20-29
+    "vpsrld $0x10,%%ymm0,%%ymm1 \n"
+    "vpand %%ymm4,%%ymm1,%%ymm1 \n"
+    "vpsrld $0x6,%%ymm1,%%ymm2 \n"
+    "vpslld $22,%%ymm1,%%ymm1 \n"
+    "vpslld $20,%%ymm2,%%ymm2 \n"
+    "vpor %%ymm1,%%ymm3,%%ymm3 \n"
+    "vpor %%ymm2,%%ymm3,%%ymm3 \n"
+    // green: (g << 2 | g >> 6) placed at bits 10-19
+    "vpsrld $0x08,%%ymm0,%%ymm1 \n"
+    "vpand %%ymm4,%%ymm1,%%ymm1 \n"
+    "vpsrld $0x6,%%ymm1,%%ymm2 \n"
+    "vpslld $12,%%ymm1,%%ymm1 \n"
+    "vpslld $10,%%ymm2,%%ymm2 \n"
+    "vpor %%ymm1,%%ymm3,%%ymm3 \n"
+    "vpor %%ymm2,%%ymm3,%%ymm3 \n"
+    // blue: (b << 2 | b >> 6) in bits 0-9
+    "vpand %%ymm4,%%ymm0,%%ymm1 \n"
+    "vpsrld $0x6,%%ymm1,%%ymm2 \n"
+    "vpslld $2,%%ymm1,%%ymm1 \n"
+    "vpor %%ymm1,%%ymm3,%%ymm3 \n"
+    "vpor %%ymm2,%%ymm3,%%ymm3 \n"
+
+    "vmovdqu %%ymm3,(%1) \n"
+    "add $0x20,%0 \n"
+    "add $0x20,%1 \n"
+    "sub $0x8,%2 \n"
+    "jg 1b \n"
+    "vzeroupper \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)  // %2
+  :: "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 56b6364e5..ead5919c3 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -11,6 +11,8 @@
 #include
 #include
 
+#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
+
 #include "libyuv/basic_types.h"
 #include "libyuv/compare.h"
 #include "libyuv/convert.h"
@@ -1069,6 +1071,7 @@ TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
 TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
 TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
 TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0)
 TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
 TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
 TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
@@ -1928,4 +1931,37 @@ TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
   EXPECT_EQ(dst[3], src[1]);
 }
 
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
+  // ARGBToAR30Row_AVX2 converts 8 pixels per loop; round up to stay in bounds.
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
+  align_buffer_page_end(src, kPixels * 4);
+  align_buffer_page_end(dst_opt, kPixels * 4);
+  align_buffer_page_end(dst_c, kPixels * 4);
+
+  MemRandomize(src, kPixels * 4);
+  memset(dst_opt, 0, kPixels * 4);
+  memset(dst_c, 1, kPixels * 4);
+
+  ARGBToAR30Row_C(src, dst_c, kPixels);
+
+  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    if (has_avx2) {
+      ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
+    } else {
+      ARGBToAR30Row_C(src, dst_opt, kPixels);
+    }
+  }
+
+  for (int i = 0; i < kPixels * 4; ++i) {
+    EXPECT_EQ(dst_opt[i], dst_c[i]);
+  }
+
+  free_aligned_buffer_page_end(src);
+  free_aligned_buffer_page_end(dst_opt);
+  free_aligned_buffer_page_end(dst_c);
+}
+#endif  // HAS_ARGBTOAR30ROW_AVX2
+
 }  // namespace libyuv