diff --git a/README.chromium b/README.chromium
index 130a916ae..3d1777bed 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1860
+Version: 1861
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index e18371693..ff6ffe47c 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -402,9 +402,8 @@ extern "C" {
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
 // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
-// TODO(fbarchard): Port MERGEUV to assembly
 #if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512) && !defined(_MSC_VER))
+    (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #define HAS_MERGEUVROW_AVX512BW
 #endif
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 3e877f38f..42f816626 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1860
+#define LIBYUV_VERSION 1861
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index 5ba6e5806..37b7091b1 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -919,7 +919,7 @@ int I422ToNV21(const uint8_t* src_y,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow = MergeUVRow_AVX2;
     }
   }
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 1b8572a0e..55516cbd8 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -384,7 +384,7 @@ int ARGBToNV12(const uint8_t* src_argb,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -562,7 +562,7 @@ int ARGBToNV21(const uint8_t* src_argb,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -737,7 +737,7 @@ int ABGRToNV12(const uint8_t* src_abgr,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -913,7 +913,7 @@ int ABGRToNV21(const uint8_t* src_abgr,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -2948,7 +2948,7 @@ int RAWToJNV21(const uint8_t* src_raw,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 0f89d269d..e3452f58e 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -162,7 +162,7 @@ void Convert8To16Plane(const uint8_t* src_y,
                        int src_stride_y,
                        uint16_t* dst_y,
                        int dst_stride_y,
-                       int scale,  // 16384 for 10 bits
+                       int scale,  // 1024 for 10 bits
                        int width,
                        int height) {
   int y;
@@ -594,7 +594,7 @@ void MergeUVPlane(const uint8_t* src_u,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
+    if (IS_ALIGNED(width, 16)) {
       MergeUVRow = MergeUVRow_AVX2;
     }
   }
@@ -736,7 +736,7 @@ void MergeUVPlane_16(const uint16_t* src_u,
 #if defined(HAS_MERGEUVROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       MergeUVRow_16 = MergeUVRow_16_AVX2;
     }
   }
diff --git a/source/row_any.cc b/source/row_any.cc
index fae4b8a21..0168061ff 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -569,7 +569,7 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON,
 ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX2
-ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX512BW
 ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
@@ -861,7 +861,7 @@ ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
   }
 
 #ifdef HAS_MERGEUVROW_16_AVX2
-ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15)
+ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 7)
 #endif
 #ifdef HAS_MERGEUVROW_16_NEON
 ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7)
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 79c158a93..aa4c0d11e 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -17,8 +17,6 @@ extern "C" {
 // This module is for GCC x86 and x64.
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
 
-#include <immintrin.h>
-
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 // Constants for ARGB
@@ -5145,21 +5143,30 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
 #endif  // HAS_DETILESPLITUVROW_SSSE3
 
 #ifdef HAS_MERGEUVROW_AVX512BW
-__attribute__ ((target("avx512vl,avx512bw")))
 void MergeUVRow_AVX512BW(const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_uv,
                          int width) {
-  do {
-    const __m512i u = _mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_u));
-    const __m512i v = _mm512_slli_epi64(_mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_v)), 8);
-    const __m512i uv = _mm512_or_si512(u, v);
-    _mm512_storeu_epi8(dst_uv, uv);
-    src_u += 32;
-    src_v += 32;
-    dst_uv += 64;
-    width -= 32;
-  } while (width > 0);
+  asm volatile("sub %0,%1 \n"
+
+      LABELALIGN
+      "1: \n"
+      "vpmovzxbw (%0),%%zmm0 \n"
+      "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n"
+      "lea 0x20(%0),%0 \n"
+      "vpsllw $0x8,%%zmm1,%%zmm1 \n"
+      "vporq %%zmm0,%%zmm1,%%zmm2 \n"
+      "vmovdqu64 %%zmm2,(%2) \n"
+      "lea 0x40(%2),%2 \n"
+      "sub $0x20,%3 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 
 #endif  // HAS_MERGEUVROW_AVX512BW
@@ -5168,31 +5175,26 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile(
+  asm volatile("sub %0,%1 \n"
 
-      "sub %0,%1 \n"
-
-      LABELALIGN
+      LABELALIGN
       "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
-      "lea 0x20(%0),%0 \n"
-      "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
-      "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
-      "vextractf128 $0x0,%%ymm2,(%2) \n"
-      "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
-      "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
-      "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
-      "lea 0x40(%2),%2 \n"
-      "sub $0x20,%3 \n"
+      "vpmovzxbw (%0),%%ymm0 \n"
+      "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n"
+      "lea 0x10(%0),%0 \n"
+      "vpsllw $0x8,%%ymm1,%%ymm1 \n"
+      "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+      "vmovdqu %%ymm2,(%2) \n"
+      "lea 0x20(%2),%2 \n"
+      "sub $0x10,%3 \n"
       "jg 1b \n"
       "vzeroupper \n"
-      : "+r"(src_u),    // %0
-        "+r"(src_v),    // %1
-        "+r"(dst_uv),   // %2
-        "+r"(width)     // %3
-      :
-      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 
 #endif  // HAS_MERGEUVROW_AVX2
@@ -5201,11 +5203,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile(
+  asm volatile("sub %0,%1 \n"
 
-      "sub %0,%1 \n"
-
-      LABELALIGN
+      LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x00(%0,%1,1),%%xmm1 \n"
@@ -5218,12 +5218,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
       "lea 0x20(%2),%2 \n"
       "sub $0x10,%3 \n"
       "jg 1b \n"
-      : "+r"(src_u),    // %0
-        "+r"(src_v),    // %1
-        "+r"(dst_uv),   // %2
-        "+r"(width)     // %3
-      :
-      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 
 #endif  // HAS_MERGEUVROW_SSE2
@@ -5233,37 +5233,35 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
                         const uint16_t* src_v,
                         uint16_t* dst_uv,
                         int depth,
                         int width) {
-  depth = 16 - depth;
   // clang-format off
   asm volatile (
       "vmovd %4,%%xmm3 \n"
+      "vmovd %5,%%xmm4 \n"
+
+      "sub %0,%1 \n"
+      // 8 pixels per loop.
-      // 16 pixels per loop.
-      LABELALIGN
+      LABELALIGN
       "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vmovdqu (%0,%1,1),%%ymm1 \n"
-      "add $0x20,%0 \n"
-
+      "vpmovzxwd (%0),%%ymm0 \n"
+      "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n"
+      "lea 0x10(%0),%0 \n"
       "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
-      "vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
-      "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n"  // mutates
-      "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
-      "vextractf128 $0x0,%%ymm2,(%2) \n"
-      "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
-      "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
-      "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
-      "add $0x40,%2 \n"
-      "sub $0x10,%3 \n"
+      "vpslld %%xmm4,%%ymm1,%%ymm1 \n"
+      "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+      "vmovdqu %%ymm2,(%2) \n"
+      "lea 0x20(%2),%2 \n"
+      "sub $0x8,%3 \n"
       "jg 1b \n"
       "vzeroupper \n"
-      : "+r"(src_u),    // %0
-        "+r"(src_v),    // %1
-        "+r"(dst_uv),   // %2
-        "+r"(width)     // %3
-      : "r"(depth)      // %4
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      : "r"(16 - depth),  // %4
+        "r"(32 - depth)   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
   // clang-format on
 }
 
 #endif  // HAS_MERGEUVROW_AVX2
@@ -5469,7 +5467,6 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
 // 512 = 9 bits
 // 1024 = 10 bits
 // 4096 = 12 bits
-// TODO(fbarchard): reduce to SSE2
 void Convert8To16Row_SSE2(const uint8_t* src_y,
                           uint16_t* dst_y,
                           int scale,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index df346ee07..7f04b6068 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -820,28 +820,6 @@ void MergeUVRow_NEON(const uint8_t* src_u,
       : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON1(const uint8_t* src_u,
-                      const uint8_t* src_v,
-                      uint8_t* dst_uv,
-                      int width) {
-  asm volatile(
-      "1: \n"
-      "ld1 {v0.16b,v2.16b}, [%0], #32 \n"  // load U
-      "ld1 {v1.16b,v3.16b}, [%1], #32 \n"  // load V
-      "subs %w3, %w3, #32 \n"  // 32 processed per loop
-      "prfm pldl1keep, [%0, 448] \n"
-      "prfm pldl1keep, [%1, 448] \n"
-      "st2 {v0.16b,v1.16b,v2.16b,v3.16b}, [%2], #64 \n"  // store 32 UV
-      "b.gt 1b \n"
-      : "+r"(src_u),   // %0
-        "+r"(src_v),   // %1
-        "+r"(dst_uv),  // %2
-        "+r"(width)    // %3  // Output registers
-      :                       // Input registers
-      : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
 
 void MergeUVRow_16_NEON(const uint16_t* src_u,
                         const uint16_t* src_v,
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 4f462d0a1..ad97b87e2 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -3534,8 +3534,8 @@ TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16)
 
 // TODO(fbarchard): improve test for platforms and cpu detect
 #ifdef HAS_MERGEUVROW_16_AVX2
 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
-  // Round count up to multiple of 16
-  const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+  // Round count up to multiple of 8
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
   align_buffer_page_end(src_pixels_u, kPixels * 2);
   align_buffer_page_end(src_pixels_v, kPixels * 2);
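
Background: every MergeUVRow kernel touched by this patch implements the same row primitive, interleaving a row of U samples with a row of V samples into NV12/NV21-style UV pairs. The rewritten MergeUVRow_AVX2 consumes 16 pixels per iteration (sub $0x10,%3) instead of 32, which is why the ANY21 remainder mask drops from 31 to 15 and the IS_ALIGNED dispatch checks drop from 32 to 16. A minimal scalar sketch of the operation, for illustration only (libyuv's own C fallback for this is MergeUVRow_C in source/row_common.cc):

#include <stdint.h>

// Interleave width U and width V bytes into width UV pairs:
// U lands in the even bytes, V in the odd bytes of dst_uv.
static void MergeUVRow_Sketch(const uint8_t* src_u,
                              const uint8_t* src_v,
                              uint8_t* dst_uv,
                              int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}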
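The new asm builds each UV pair as a single 16-bit lane, u | (v << 8), which on a little-endian target stores U then V. Zero-extending with vpmovzxbw avoids the in-lane mutation of ymm vpunpcklbw/vpunpckhbw that forced the old code to reassemble its output with four vextractf128 stores. A hedged intrinsics rendering of the same loop (the function name is hypothetical; assumes width is a positive multiple of 16, which the Any wrapper guarantees before calling the full-width kernel):

#include <immintrin.h>
#include <stdint.h>

static void MergeUVRow_AVX2_Intrinsics(const uint8_t* src_u,
                                       const uint8_t* src_v,
                                       uint8_t* dst_uv,
                                       int width) {
  do {
    // vpmovzxbw: widen 16 U and 16 V bytes to 16-bit words.
    __m256i u = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u));
    __m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v));
    // vpsllw + vpor: place V in the high byte of each word, U in the low.
    __m256i uv = _mm256_or_si256(u, _mm256_slli_epi16(v, 8));
    _mm256_storeu_si256((__m256i*)dst_uv, uv);  // 32 interleaved bytes out
    src_u += 16;
    src_v += 16;
    dst_uv += 32;
    width -= 16;
  } while (width > 0);
}

The AVX512BW variant is the same idea at zmm width (32 pixels per iteration, hence its mask of 31 in row_any.cc).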
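MergeUVRow_16_AVX2 applies the trick one level up: each 32-bit lane becomes (v << (32 - depth)) | (u << (16 - depth)), so the two shift counts passed as operands %4 and %5 left-justify a depth-bit sample in the low and high 16-bit halves of the lane, and a little-endian store emits U then V, each scaled to the top of 16 bits. A scalar sketch of the intended result, under the assumption that depth is 10, 12, or 16 as elsewhere in libyuv (illustrative, not the library's code):

#include <stdint.h>

static void MergeUVRow_16_Sketch(const uint16_t* src_u,
                                 const uint16_t* src_v,
                                 uint16_t* dst_uv,
                                 int depth,
                                 int width) {
  int shift = 16 - depth;  // left-justify a depth-bit sample in 16 bits
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = (uint16_t)(src_u[x] << shift);
    dst_uv[2 * x + 1] = (uint16_t)(src_v[x] << shift);
  }
}

Since vpmovzxwd widens only 8 pixels per ymm iteration, the Any-wrapper mask in row_any.cc drops from 15 to 7 and the MergeUVPlane_16 dispatch check from 16 to 8, matching the test rounding change in planar_test.cc.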