Optimize yuv alpha blend AVX2 code to do 32 pixels at a time.

out/Release/libyuv_unittest --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=9999 --libyuv_flags=-1 --gtest_filter=*I420Blend_Opt

Was
LibYUVPlanarTest.I420Blend_Opt (2335 ms)
Now
LibYUVPlanarTest.I420Blend_Opt (1937 ms)
vs SSSE3
LibYUVPlanarTest.I420Blend_Opt (2599 ms)

BUG=libyuv:527
R=dhrosa@google.com

Review URL: https://codereview.chromium.org/1505673003 .
2026-02-08 18:56:43 +08:00 · 2015-12-08 18:20:30 -08:00 · 2015-12-08 18:20:30 -08:00 · dee77a4ebe
commit dee77a4ebe
parent fae1a10545
4 changed files with 58 additions and 39 deletions
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@ -10,7 +10,7 @@
 #include "libyuv/cpu_id.h"
-#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__clang__)
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
@ -207,8 +207,8 @@ int InitCpuFlags(void) {
 #ifdef HAS_XGETBV
  // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
-  if ((cpu_info1[2] & 0x1c000000) == 0x1c000000 &&  // AVX and OSXSave
+  if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
-      (GetXCR0() & 6) == 6) {  // Test OD saves YMM registers
+      ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
    cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
    // Detect AVX512bw
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -620,7 +620,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
 #if defined(HAS_BLENDPLANEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
 //  BlendPlaneRow = BlendPlaneRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 32)) {
      BlendPlaneRow = BlendPlaneRow_AVX2;
    }
  }
@ -688,7 +688,7 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
 #if defined(HAS_BLENDPLANEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
 //  BlendPlaneRow = BlendPlaneRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 16)) {
+    if (IS_ALIGNED(halfwidth, 32)) {
      BlendPlaneRow = BlendPlaneRow_AVX2;
    }
  }
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@ -3469,7 +3469,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_BLENDPLANEROW_SSSE3
 // Blend 8 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// unsigned version of math
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
  asm volatile (
@ -3514,8 +3517,11 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
 #endif  // HAS_BLENDPLANEROW_SSSE3
 #ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 16 pixels at a time.
+// Blend 32 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// unsigned version of math
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
                        const uint8* alpha, uint8* dst, int width) {
  asm volatile (
@ -3531,27 +3537,30 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
    "sub        %2,%1                          \n"
    "sub        %2,%3                          \n"
-    // 16 pixel loop.
+    // 32 pixel loop.
    LABELALIGN
  "1:                                          \n"
-    "vmovdqu    (%2),%%xmm0                    \n"
+    "vmovdqu    (%2),%%ymm0                    \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
    "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
    "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vmovdqu    (%0,%2,1),%%xmm1               \n"
+    "vmovdqu    (%0,%2,1),%%ymm1               \n"
-    "vmovdqu    (%1,%2,1),%%xmm2               \n"
+    "vmovdqu    (%1,%2,1),%%ymm2               \n"
-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
    "vpsubb     %%ymm6,%%ymm1,%%ymm4           \n"
    "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%ymm0,(%3,%2,1)               \n"
-    "vmovdqu    %%xmm0,(%3,%2,1)               \n"
+    "lea        0x20(%2),%2                    \n"
-    "lea        0x10(%2),%2                    \n"
+    "sub        $0x20,%4                       \n"
    "sub        $0x10,%4                       \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src0),       // %0
@ -3559,7 +3568,8 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
    "+r"(alpha),      // %2
    "+r"(dst),        // %3
    "+r"(width)       // %4
-  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
+  :: "memory", "cc", "eax",
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 #endif  // HAS_BLENDPLANEROW_AVX2
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -4065,7 +4065,10 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 #ifdef HAS_BLENDPLANEROW_SSSE3
 // Blend 8 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// unsigned version of math
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 __declspec(naked)
 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
@ -4116,8 +4119,11 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
 #endif  // HAS_BLENDPLANEROW_SSSE3
 #ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 16 pixels at a time.
+// Blend 32 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// unsigned version of math
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 __declspec(naked)
 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
@ -4141,27 +4147,30 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
    sub         edx, esi
    sub         edi, esi
-    // 16 pixel loop.
+    // 32 pixel loop.
-  convertloop16:
+  convertloop32:
-    vmovdqu     xmm0, [esi]        // alpha
+    vmovdqu     ymm0, [esi]        // alpha
-    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
-    vpunpcklbw  ymm0, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
    vpxor       ymm3, ymm3, ymm5   // a, 255-a
    vpxor       ymm0, ymm0, ymm5   // a, 255-a
-    vmovdqu     xmm1, [eax + esi]  // src0
+    vmovdqu     ymm1, [eax + esi]  // src0
-    vmovdqu     xmm2, [edx + esi]  // src1
+    vmovdqu     ymm2, [edx + esi]  // src1
-    vpermq      ymm1, ymm1, 0xd8
+    vpunpckhbw  ymm4, ymm1, ymm2
    vpermq      ymm2, ymm2, 0xd8
    vpunpcklbw  ymm1, ymm1, ymm2
    vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
    vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
    vpmaddubsw  ymm3, ymm3, ymm4
    vpmaddubsw  ymm0, ymm0, ymm1
    vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
    vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
    vpsrlw      ymm3, ymm3, 8
    vpsrlw      ymm0, ymm0, 8
-    vpackuswb   ymm0, ymm0, ymm0
+    vpackuswb   ymm0, ymm0, ymm3
-    vpermq      ymm0, ymm0, 0xd8
+    vmovdqu     [edi + esi], ymm0
-    vmovdqu     [edi + esi], xmm0
+    lea         esi, [esi + 32]
-    lea         esi, [esi + 16]
+    sub         ecx, 32
-    sub         ecx, 16
+    jg          convertloop32
    jg          convertloop16
    pop         edi
    pop         esi