diff --git a/README.chromium b/README.chromium index 1407f963e..8baf8d277 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1934 +Version: 1935 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 1efb6ccc8..3072d8ff9 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -50,7 +50,6 @@ extern "C" { #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 -#define HAS_HALFFLOATROW_SSE2 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 @@ -6863,14 +6862,6 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, // Scale and convert to half float. void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); void HalfFloatRow_AVX2(const uint16_t* src, uint16_t* dst, float scale, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b745710eb..0301fb191 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1934 +#define LIBYUV_VERSION 1935 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index fde3717a4..1a6a445b8 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -5077,14 +5077,6 @@ int HalfFloatPlane(const uint16_t* src_y, height = 1; src_stride_y = dst_stride_y = 0; } -#if defined(HAS_HALFFLOATROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - HalfFloatRow = HalfFloatRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - HalfFloatRow = HalfFloatRow_SSE2; - } - } -#endif #if defined(HAS_HALFFLOATROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { HalfFloatRow = HalfFloatRow_Any_AVX2; diff --git a/source/row_any.cc b/source/row_any.cc index 81e0f44fb..4ae858560 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1748,9 +1748,6 @@ ANY11SB(Convert8To8Row_Any_AVX2, memcpy(dst_ptr + np, vout, r * BPP); \ } -#ifdef HAS_HALFFLOATROW_SSE2 -ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) -#endif #ifdef HAS_HALFFLOATROW_AVX2 ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 0da6e2ada..767dc8605 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -224,8 +224,8 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0xff000000 "vpslld $0x18,%%ymm6,%%ymm6 \n" - "vbroadcastf128 %3,%%ymm4 \n" // - "vbroadcastf128 %4,%%ymm5 \n" // + "vbroadcasti128 %3,%%ymm4 \n" // + "vbroadcasti128 %4,%%ymm5 \n" // LABELALIGN // "1: \n" @@ -609,7 +609,7 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" + "vbroadcasti128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -703,7 +703,7 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTORAWROW_AVX2 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" + "vbroadcasti128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -1061,7 +1061,7 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcasti128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1098,7 +1098,7 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ABGRTOAR30ROW_AVX2 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcasti128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1264,8 +1264,8 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" - "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN + "vbroadcasti128 %3,%%ymm2 \n" + "vbroadcasti128 %4,%%ymm3 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1317,7 +1317,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + asm volatile("vbroadcasti128 %3,%%ymm2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1532,8 +1532,8 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsllw $15,%%ymm5,%%ymm5 \n" "vpacksswb %%ymm5,%%ymm5,%%ymm5 \n" - "vbroadcastf128 0(%3),%%ymm4 \n" - "vbroadcastf128 0x60(%3),%%ymm7 \n" + "vbroadcasti128 0(%3),%%ymm4 \n" + "vbroadcasti128 0x60(%3),%%ymm7 \n" "vpmaddubsw %%ymm5,%%ymm4,%%ymm6 \n" "vphaddw %%ymm6,%%ymm6,%%ymm6 \n" "vpsubw %%ymm6,%%ymm7,%%ymm7 \n" @@ -1933,8 +1933,8 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, int width, const struct ArgbConstants* c) { asm volatile( - "vbroadcastf128 0x20(%5),%%ymm4 \n" // RGBToU - "vbroadcastf128 0x40(%5),%%ymm5 \n" // RGBToV + "vbroadcasti128 0x20(%5),%%ymm4 \n" // RGBToU + "vbroadcasti128 0x40(%5),%%ymm5 \n" // RGBToV "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0x0101 "vpabsb %%ymm6,%%ymm6 \n" "vmovdqa %6,%%ymm7 \n" // kShuffleAARRGGBB @@ -3421,7 +3421,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \ "vpermq $0xd8,%%zmm3,%%zmm3 \n" \ "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \ - "vmovups (%[y_buf]),%%ymm4 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \ "vpermq $0xd8,%%zmm4,%%zmm4 \n" \ "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \ @@ -3614,32 +3614,26 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpshufb %%ymm7,%%ymm1,%%ymm3 \n" \ "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" -// TODO(fbarchard): Remove broadcastb #if defined(__x86_64__) #define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ + "vpcmpeqb %%ymm13,%%ymm13,%%ymm13 \n" \ "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ - "vpsllw $7,%%xmm13,%%xmm13 \n" \ + "vpabsb %%ymm13,%%ymm13 \n" \ "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ - "vpbroadcastb %%xmm13,%%ymm13 \n" \ + "vpsllw $7,%%ymm13,%%ymm13 \n" \ "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" #define YUVTORGB_SETUP_AVX512BW(yuvconstants) \ - "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ - "movdqa (%[yuvconstants]),%%xmm8 \n" \ - "vpbroadcastq %%xmm8, %%zmm8 \n" \ - "vpsllw $7,%%xmm13,%%xmm13 \n" \ - "vpbroadcastb %%xmm13,%%zmm13 \n" \ - "movq 32(%[yuvconstants]),%%xmm9 \n" \ - "vpbroadcastq %%xmm9,%%zmm9 \n" \ - "movq 64(%[yuvconstants]),%%xmm10 \n" \ - "vpbroadcastq %%xmm10,%%zmm10 \n" \ - "movq 96(%[yuvconstants]),%%xmm11 \n" \ - "vpbroadcastq %%xmm11,%%zmm11 \n" \ - "movq 128(%[yuvconstants]),%%xmm12 \n" \ - "vpbroadcastq %%xmm12,%%zmm12 \n" \ + "vpternlogd $0xff,%%zmm13,%%zmm13,%%zmm13 \n" \ + "vpbroadcastq (%[yuvconstants]),%%zmm8 \n" \ + "vpabsb %%zmm13,%%zmm13 \n" \ + "vpsllw $7,%%zmm13,%%zmm13 \n" \ + "vpbroadcastq 32(%[yuvconstants]),%%zmm9 \n" \ + "vpbroadcastq 64(%[yuvconstants]),%%zmm10 \n" \ + "vpbroadcastq 96(%[yuvconstants]),%%zmm11 \n" \ + "vpbroadcastq 128(%[yuvconstants]),%%zmm12 \n" \ "vmovups (%[quadsplitperm]),%%zmm16 \n" \ "vmovups (%[dquadsplitperm]),%%zmm17 \n" \ "vmovups (%[unperm]),%%zmm18 \n" @@ -4384,8 +4378,8 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile( - "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" - "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( + "vbroadcasti128 %[kShuffleYUY2Y],%%ymm6 \n" + "vbroadcasti128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -4411,8 +4405,8 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile( - "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" - "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( + "vbroadcasti128 %[kShuffleUYVYY],%%ymm6 \n" + "vbroadcasti128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -4671,7 +4665,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcasti128 %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -4719,7 +4713,7 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcasti128 %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -5207,7 +5201,7 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, depth = 16 - depth; asm volatile( "vmovd %4,%%xmm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" + "vbroadcasti128 %5,%%ymm4 \n" "sub %1,%2 \n" // 16 pixels per loop. @@ -6109,7 +6103,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, "sub %1,%3 \n" "sub %1,%4 \n" "vmovdqa %7,%%ymm3 \n" - "vbroadcastf128 %6,%%ymm4 \n" + "vbroadcasti128 %6,%%ymm4 \n" LABELALIGN "1: \n" @@ -6157,7 +6151,7 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, int width) { asm volatile( "vmovdqa %6,%%ymm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" + "vbroadcasti128 %5,%%ymm4 \n" LABELALIGN "1: \n" @@ -6406,7 +6400,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" - "vbroadcastf128 %7,%%ymm5 \n" + "vbroadcasti128 %7,%%ymm5 \n" "vmovd %6,%%xmm6 \n" LABELALIGN @@ -6461,7 +6455,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" - "vbroadcastf128 %6,%%ymm5 \n" + "vbroadcasti128 %6,%%ymm5 \n" "vmovd %5,%%xmm6 \n" "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) @@ -6694,7 +6688,7 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, int width) { asm volatile( "vmovdqa %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" + "vbroadcasti128 %4,%%ymm5 \n" LABELALIGN "1: \n" @@ -7640,7 +7634,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uintptr_t alpha; asm volatile( "sub %0,%1 \n" - "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcasti128 %5,%%ymm5 \n" // 8 pixel loop. LABELALIGN @@ -8887,7 +8881,7 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile("vbroadcastf128 (%3),%%ymm5 \n" + asm volatile("vbroadcasti128 (%3),%%ymm5 \n" LABELALIGN "1: \n" @@ -9113,10 +9107,10 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, const float* poly, int width) { asm volatile( - "vbroadcastf128 (%3),%%ymm4 \n" - "vbroadcastf128 0x10(%3),%%ymm5 \n" - "vbroadcastf128 0x20(%3),%%ymm6 \n" - "vbroadcastf128 0x30(%3),%%ymm7 \n" + "vbroadcasti128 (%3),%%ymm4 \n" + "vbroadcasti128 0x10(%3),%%ymm5 \n" + "vbroadcasti128 0x20(%3),%%ymm6 \n" + "vbroadcasti128 0x30(%3),%%ymm7 \n" // 2 pixel loop. LABELALIGN @@ -9148,46 +9142,8 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 -#ifdef HAS_HALFFLOATROW_SSE2 -static float kScaleBias = 1.9259299444e-34f; -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - scale *= kScaleBias; - asm volatile( - "movd %3,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" // 8 shorts - "add $0x10,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 - "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats - "punpckhwd %%xmm5,%%xmm3 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "psrld $0xd,%%xmm2 \n" - "psrld $0xd,%%xmm3 \n" - "packssdw %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,-0x10(%0,%1,1) \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(scale) // %3 - : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_HALFFLOATROW_SSE2 - #ifdef HAS_HALFFLOATROW_AVX2 +static float kScaleBias = 1.9259299444e-34f; void HalfFloatRow_AVX2(const uint16_t* src, uint16_t* dst, float scale, @@ -9510,9 +9466,9 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, int width) { asm volatile( "sub %0,%1 \n" - "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants - "vbroadcastf128 16(%4),%%ymm5 \n" - "vbroadcastf128 32(%4),%%ymm6 \n" + "vbroadcasti128 (%4),%%ymm4 \n" // 3 shuffler constants + "vbroadcasti128 16(%4),%%ymm5 \n" + "vbroadcasti128 32(%4),%%ymm6 \n" "1: \n" "vmovdqu (%0),%%ymm2 \n" // load 32 Y values @@ -9619,7 +9575,7 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcasti128 %3,%%ymm5 \n" LABELALIGN "1: \n"