diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h
index a45948094..d4a974c54 100644
--- a/include/libyuv/rotate_row.h
+++ b/include/libyuv/rotate_row.h
@@ -27,12 +27,10 @@ extern "C" {
 #define LIBYUV_DISABLE_NEON
 #endif

-// clang >= 19.0.0 required for SME
-#if !defined(LIBYUV_DISABLE_SME) && defined(__clang__) && defined(__aarch64__)
-#if __clang_major__ < 19
+// temporarily disable SME
+#if !defined(LIBYUV_DISABLE_SME)
 #define LIBYUV_DISABLE_SME
 #endif
-#endif

 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5c110dd2b..97eabbf67 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -400,10 +400,11 @@ extern "C" {

 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
-// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+// TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI.
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
+#define HAS_CONVERT16TO8ROW_AVX512BW
 #define HAS_MERGEUVROW_AVX512BW
 #endif

@@ -3337,6 +3338,10 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width);
+void Convert16To8Row_AVX512BW(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width);
 void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
                                uint8_t* dst_ptr,
                                int scale,
@@ -3345,6 +3350,10 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
                               uint8_t* dst_ptr,
                               int scale,
                               int width);
+void Convert16To8Row_Any_AVX512BW(const uint16_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int scale,
+                                  int width);
 void Convert16To8Row_NEON(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
diff --git a/source/convert.cc b/source/convert.cc
index 4ff63f6f9..7d44d8ae4 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -705,6 +705,14 @@ int I010ToNV12(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      Convert16To8Row = Convert16To8Row_AVX512BW;
+    }
+  }
+#endif

 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 6191e4423..6e8801cda 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -149,6 +149,14 @@ void Convert16To8Plane(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      Convert16To8Row = Convert16To8Row_AVX512BW;
+    }
+  }
+#endif

   // Convert plane
   for (y = 0; y < height; ++y) {
diff --git a/source/row_any.cc b/source/row_any.cc
index 2118ad500..67dc8d2f6 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1684,8 +1684,8 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
 // Any 1 to 1 with parameter and shorts.  BPP measures in shorts.
 #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK)             \
   void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
-    SIMD_ALIGNED(STYPE vin[32]);                                             \
-    SIMD_ALIGNED(DTYPE vout[32]);                                            \
+    SIMD_ALIGNED(STYPE vin[64]);                                             \
+    SIMD_ALIGNED(DTYPE vout[64]);                                            \
     memset(vin, 0, sizeof(vin)); /* for msan */                              \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
@@ -1715,6 +1715,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
        uint8_t,
        31)
 #endif
+#ifdef HAS_CONVERT16TO8ROW_AVX512BW
+ANY11C(Convert16To8Row_Any_AVX512BW,
+       Convert16To8Row_AVX512BW,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       63)
+#endif
 #ifdef HAS_CONVERT16TO8ROW_NEON
 ANY11C(Convert16To8Row_Any_NEON,
        Convert16To8Row_NEON,
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 69babb453..cb757c755 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5202,8 +5202,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
                          int width) {
   asm volatile (
       "vmovd %3,%%xmm3 \n"
-      "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
-      "vbroadcastss %%xmm3,%%ymm3 \n"
+      "vpbroadcastw %%xmm3,%%ymm3 \n"
       "sub %0,%1 \n"

       // 32 pixels per loop.
@@ -5239,8 +5238,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y,
                        int width) {
   asm volatile (
       "vmovd %3,%%xmm3 \n"
-      "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
-      "vbroadcastss %%xmm3,%%ymm3 \n"
+      "vpbroadcastw %%xmm3,%%ymm3 \n"
       "sub %0,%1 \n"

       // 32 pixels per loop.
@@ -5306,8 +5304,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
                           int width) {
   asm volatile (
       "vmovd %3,%%xmm2 \n"
-      "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
-      "vbroadcastss %%xmm2,%%ymm2 \n"
+      "vpbroadcastw %%xmm2,%%ymm2 \n"

       // 32 pixels per loop.
       LABELALIGN
@@ -5332,6 +5329,38 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
 }
 #endif  // HAS_CONVERT16TO8ROW_AVX2

+#ifdef HAS_CONVERT16TO8ROW_AVX512BW
+void Convert16To8Row_AVX512BW(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width) {
+  asm volatile (
+      "vpbroadcastw %3,%%zmm2 \n"
+
+      // 64 pixels per loop.
+      LABELALIGN
+      "1: \n"
+      "vmovdqu8 (%0),%%zmm0 \n"
+      "vmovdqu8 0x40(%0),%%zmm1 \n"
+      "add $0x80,%0 \n"
+      "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n"
+      "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n"
+      "vpmovuswb %%zmm0,%%ymm0 \n"
+      "vpmovuswb %%zmm1,%%ymm1 \n"
+      "vmovdqu8 %%ymm0,(%1) \n"
+      "vmovdqu8 %%ymm1,0x20(%1) \n"
+      "add $0x40,%1 \n"
+      "sub $0x40,%2 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_CONVERT16TO8ROW_AVX512BW
+
 // Use scale to convert to lsb formats depending how many bits there are:
 // 512 = 9 bits
 // 1024 = 10 bits
@@ -5374,8 +5403,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
                           int width) {
   asm volatile (
       "vmovd %3,%%xmm2 \n"
-      "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
-      "vbroadcastss %%xmm2,%%ymm2 \n"
+      "vpbroadcastw %%xmm2,%%ymm2 \n"

       // 32 pixels per loop.
       LABELALIGN
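
For reference, the new AVX512BW kernel computes the same per-pixel operation as the existing C and AVX2 paths: multiply each 16-bit sample by scale, keep the high 16 bits (vpmulhuw), then saturate to 8 bits (vpmovuswb). Below is a minimal scalar sketch of that operation; the function name and the example scale values are illustrative and not taken from this patch.

// Scalar sketch (assumed equivalent): dst = min((src * scale) >> 16, 255).
// Example scales (assumptions): 16384 maps 10-bit input to 8 bits,
// 4096 maps 12-bit input to 8 bits.
#include <stdint.h>

static void Convert16To8Row_Sketch(const uint16_t* src_y,
                                   uint8_t* dst_y,
                                   int scale,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t v = ((uint32_t)src_y[x] * (uint32_t)scale) >> 16;
    dst_y[x] = (uint8_t)(v > 255 ? 255 : v);  // unsigned saturate to byte
  }
}

Widths that are not multiples of 64 are handled by the ANY11C wrapper above (MASK 63); the dispatch in convert.cc and planar_functions.cc only selects the full-width AVX512BW kernel when IS_ALIGNED(width, 64).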