From eec8dd37e827a78c3bdbb66da6caad89f4b8c4dd Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 15 Apr 2022 11:21:25 -0700 Subject: [PATCH] Change ScaleUVRowUp2_Bilinear_16_SSE2 to SSE41 Bug: libyuv:928 xed -i scale_gcc.o: SYM ScaleUVRowUp2_Linear_16_SSE2: XDIS 0: LOGICAL SSE2 660FEFED pxor xmm5, xmm5 XDIS 4: SSE SSE2 660F76E4 pcmpeqd xmm4, xmm4 XDIS 8: SSE SSE2 660F72D41F psrld xmm4, 0x1f XDIS d: SSE SSE2 660F72F401 pslld xmm4, 0x1 XDIS 12: DATAXFER SSE2 F30F7E07 movq xmm0, qword ptr [rdi] XDIS 16: DATAXFER SSE2 F30F7E4F04 movq xmm1, qword ptr [rdi+0x4] XDIS 1b: SSE SSE2 660F61C5 punpcklwd xmm0, xmm5 XDIS 1f: SSE SSE2 660F61CD punpcklwd xmm1, xmm5 XDIS 23: DATAXFER SSE2 660F6FD0 movdqa xmm2, xmm0 XDIS 27: DATAXFER SSE2 660F6FD9 movdqa xmm3, xmm1 XDIS 2b: SSE SSE2 660F70D24E pshufd xmm2, xmm2, 0x4e XDIS 30: SSE SSE2 660F70DB4E pshufd xmm3, xmm3, 0x4e XDIS 35: SSE SSE2 660FFED4 paddd xmm2, xmm4 XDIS 39: SSE SSE2 660FFEDC paddd xmm3, xmm4 XDIS 3d: SSE SSE2 660FFED0 paddd xmm2, xmm0 XDIS 41: SSE SSE2 660FFED9 paddd xmm3, xmm1 XDIS 45: SSE SSE2 660FFEC0 paddd xmm0, xmm0 XDIS 49: SSE SSE2 660FFEC9 paddd xmm1, xmm1 XDIS 4d: SSE SSE2 660FFEC2 paddd xmm0, xmm2 XDIS 51: SSE SSE2 660FFECB paddd xmm1, xmm3 XDIS 55: SSE SSE2 660F72D002 psrld xmm0, 0x2 XDIS 5a: SSE SSE2 660F72D102 psrld xmm1, 0x2 XDIS 5f: SSE SSE4 660F382BC1 packusdw xmm0, xmm1 XDIS 64: DATAXFER SSE2 F30F7F06 movdqu xmmword ptr [rsi], xmm0 XDIS 68: MISC BASE 488D7F08 lea rdi, ptr [rdi+0x8] XDIS 6c: MISC BASE 488D7610 lea rsi, ptr [rsi+0x10] XDIS 70: BINARY BASE 83EA04 sub edx, 0x4 XDIS 73: COND_BR BASE 7F9D jnle 0x12 XDIS 75: RET BASE C3 ret SYM ScaleUVRowUp2_Bilinear_16_SSE2: XDIS 0: LOGICAL SSE2 660FEFFF pxor xmm7, xmm7 XDIS 4: SSE SSE2 660F76F6 pcmpeqd xmm6, xmm6 XDIS 8: SSE SSE2 660F72D61F psrld xmm6, 0x1f XDIS d: SSE SSE2 660F72F603 pslld xmm6, 0x3 XDIS 12: DATAXFER SSE2 F30F7E07 movq xmm0, qword ptr [rdi] XDIS 16: DATAXFER SSE2 F30F7E4F04 movq xmm1, qword ptr [rdi+0x4] XDIS 1b: SSE SSE2 660F61C7 
punpcklwd xmm0, xmm7 XDIS 1f: SSE SSE2 660F61CF punpcklwd xmm1, xmm7 XDIS 23: DATAXFER SSE2 660F6FD0 movdqa xmm2, xmm0 XDIS 27: DATAXFER SSE2 660F6FD9 movdqa xmm3, xmm1 XDIS 2b: SSE SSE2 660F70D24E pshufd xmm2, xmm2, 0x4e XDIS 30: SSE SSE2 660F70DB4E pshufd xmm3, xmm3, 0x4e XDIS 35: SSE SSE2 660FFED0 paddd xmm2, xmm0 XDIS 39: SSE SSE2 660FFED9 paddd xmm3, xmm1 XDIS 3d: SSE SSE2 660FFEC0 paddd xmm0, xmm0 XDIS 41: SSE SSE2 660FFEC9 paddd xmm1, xmm1 XDIS 45: SSE SSE2 660FFEC2 paddd xmm0, xmm2 XDIS 49: SSE SSE2 660FFECB paddd xmm1, xmm3 XDIS 4d: DATAXFER SSE2 F30F7E1477 movq xmm2, qword ptr [rdi+rsi*2] XDIS 52: DATAXFER SSE2 F30F7E5C7704 movq xmm3, qword ptr [rdi+rsi*2+0x4] XDIS 58: SSE SSE2 660F61D7 punpcklwd xmm2, xmm7 XDIS 5c: SSE SSE2 660F61DF punpcklwd xmm3, xmm7 XDIS 60: DATAXFER SSE2 660F6FE2 movdqa xmm4, xmm2 XDIS 64: DATAXFER SSE2 660F6FEB movdqa xmm5, xmm3 XDIS 68: SSE SSE2 660F70E44E pshufd xmm4, xmm4, 0x4e XDIS 6d: SSE SSE2 660F70ED4E pshufd xmm5, xmm5, 0x4e XDIS 72: SSE SSE2 660FFEE2 paddd xmm4, xmm2 XDIS 76: SSE SSE2 660FFEEB paddd xmm5, xmm3 XDIS 7a: SSE SSE2 660FFED2 paddd xmm2, xmm2 XDIS 7e: SSE SSE2 660FFEDB paddd xmm3, xmm3 XDIS 82: SSE SSE2 660FFED4 paddd xmm2, xmm4 XDIS 86: SSE SSE2 660FFEDD paddd xmm3, xmm5 XDIS 8a: DATAXFER SSE2 660F6FE0 movdqa xmm4, xmm0 XDIS 8e: DATAXFER SSE2 660F6FEA movdqa xmm5, xmm2 XDIS 92: SSE SSE2 660FFEE0 paddd xmm4, xmm0 XDIS 96: SSE SSE2 660FFEEE paddd xmm5, xmm6 XDIS 9a: SSE SSE2 660FFEE0 paddd xmm4, xmm0 XDIS 9e: SSE SSE2 660FFEE5 paddd xmm4, xmm5 XDIS a2: SSE SSE2 660F72D404 psrld xmm4, 0x4 XDIS a7: DATAXFER SSE2 660F6FEA movdqa xmm5, xmm2 XDIS ab: SSE SSE2 660FFEEA paddd xmm5, xmm2 XDIS af: SSE SSE2 660FFEC6 paddd xmm0, xmm6 XDIS b3: SSE SSE2 660FFEEA paddd xmm5, xmm2 XDIS b7: SSE SSE2 660FFEE8 paddd xmm5, xmm0 XDIS bb: SSE SSE2 660F72D504 psrld xmm5, 0x4 XDIS c0: DATAXFER SSE2 660F6FC1 movdqa xmm0, xmm1 XDIS c4: DATAXFER SSE2 660F6FD3 movdqa xmm2, xmm3 XDIS c8: SSE SSE2 660FFEC1 paddd xmm0, xmm1 XDIS cc: SSE SSE2 
660FFED6 paddd xmm2, xmm6 XDIS d0: SSE SSE2 660FFEC1 paddd xmm0, xmm1 XDIS d4: SSE SSE2 660FFEC2 paddd xmm0, xmm2 XDIS d8: SSE SSE2 660F72D004 psrld xmm0, 0x4 XDIS dd: DATAXFER SSE2 660F6FD3 movdqa xmm2, xmm3 XDIS e1: SSE SSE2 660FFED3 paddd xmm2, xmm3 XDIS e5: SSE SSE2 660FFECE paddd xmm1, xmm6 XDIS e9: SSE SSE2 660FFED3 paddd xmm2, xmm3 XDIS ed: SSE SSE2 660FFED1 paddd xmm2, xmm1 XDIS f1: SSE SSE2 660F72D204 psrld xmm2, 0x4 XDIS f6: SSE SSE4 660F382BE0 packusdw xmm4, xmm0 XDIS fb: DATAXFER SSE2 F30F7F22 movdqu xmmword ptr [rdx], xmm4 XDIS ff: SSE SSE4 660F382BEA packusdw xmm5, xmm2 XDIS 104: DATAXFER SSE2 F30F7F2C4A movdqu xmmword ptr [rdx+rcx*2], xmm5 XDIS 109: MISC BASE 488D7F08 lea rdi, ptr [rdi+0x8] XDIS 10d: MISC BASE 488D5210 lea rdx, ptr [rdx+0x10] XDIS 111: BINARY BASE 4183E804 sub r8d, 0x4 XDIS 115: COND_BR BASE 0F8FF7FEFFFF jnle 0x12 XDIS 11b: RET BASE C3 ret Change-Id: Ia20860e9c3c45368822cfd8877167ff0bf973dcc Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3587602 Reviewed-by: richard winterton Commit-Queue: Frank Barchard --- README.chromium | 2 +- include/libyuv/scale_row.h | 36 ++++++++++++++++++------------------ include/libyuv/version.h | 2 +- source/convert.cc | 12 ++++++++---- source/convert_argb.cc | 24 ++++++++++++------------ source/convert_from.cc | 6 ++++-- source/planar_functions.cc | 9 ++++++--- source/scale_any.cc | 12 ++++++------ source/scale_gcc.cc | 21 ++++++++++----------- source/scale_uv.cc | 12 ++++++------ 10 files changed, 72 insertions(+), 64 deletions(-) diff --git a/README.chromium b/README.chromium index a96f05296..13fa8747f 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1819 +Version: 1820 License: BSD License File: LICENSE diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 682b33428..cc1c90619 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -86,8 +86,8 @@ extern "C" { 
#define HAS_SCALEROWUP2BILINEAR_16_SSE2 #define HAS_SCALEUVROWUP2LINEAR_SSSE3 #define HAS_SCALEUVROWUP2BILINEAR_SSSE3 -#define HAS_SCALEUVROWUP2LINEAR_16_SSE2 -#define HAS_SCALEUVROWUP2BILINEAR_16_SSE2 +#define HAS_SCALEUVROWUP2LINEAR_16_SSE41 +#define HAS_SCALEUVROWUP2BILINEAR_16_SSE41 #endif // The following are available for gcc/clang x86 platforms, but @@ -1235,22 +1235,22 @@ void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); +void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 475804369..f42a46b9c 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1819 +#define LIBYUV_VERSION 1820 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc 
index 38f0a0a56..502f002d6 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -83,7 +83,8 @@ int I420Copy(const uint8_t* src_y, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -125,7 +126,8 @@ int I010Copy(const uint16_t* src_y, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -169,7 +171,8 @@ static int Planar16bitTo8bit(const uint16_t* src_y, int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); int scale = 1 << (24 - depth); - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -539,7 +542,8 @@ int I422ToI210(const uint8_t* src_y, int width, int height) { int halfwidth = (width + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. 
diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 11cda0787..942df30a0 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -6647,9 +6647,9 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2; +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif @@ -6737,9 +6737,9 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2; +#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif @@ -6813,9 +6813,9 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2; +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif @@ -6903,9 +6903,9 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2; +#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif diff --git a/source/convert_from.cc b/source/convert_from.cc index 932a32b81..8bd07e4ce 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -85,7 +85,8 @@ int I420ToI010(const uint8_t* src_y, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || 
!dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -129,7 +130,8 @@ int I420ToI012(const uint8_t* src_y, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. diff --git a/source/planar_functions.cc b/source/planar_functions.cc index a69792647..42fd9c51e 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -240,7 +240,8 @@ int I422Copy(const uint8_t* src_y, int height) { int halfwidth = (width + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } @@ -279,7 +280,8 @@ int I444Copy(const uint8_t* src_y, int dst_stride_v, int width, int height) { - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. 
@@ -319,7 +321,8 @@ int I210Copy(const uint16_t* src_y, int height) { int halfwidth = (width + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } diff --git a/source/scale_any.cc b/source/scale_any.cc index 0f6c345d5..e820584b0 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -924,9 +924,9 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, uint8_t) #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE2, - ScaleUVRowUp2_Linear_16_SSE2, +#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41 +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41, + ScaleUVRowUp2_Linear_16_SSE41, ScaleUVRowUp2_Linear_16_C, 3, uint16_t) @@ -1022,9 +1022,9 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, uint8_t) #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE2, - ScaleUVRowUp2_Bilinear_16_SSE2, +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41 +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41, + ScaleUVRowUp2_Bilinear_16_SSE41, ScaleUVRowUp2_Bilinear_16_C, 7, uint16_t) diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 0ac65f351..d827c0e7f 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -1285,7 +1285,6 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) "packuswb %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" - "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" @@ -2666,10 +2665,10 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 -void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { +#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41 +void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { asm volatile( "pxor %%xmm5,%%xmm5 \n" "pcmpeqd 
%%xmm4,%%xmm4 \n" @@ -2716,12 +2715,12 @@ void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 -void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41 +void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { asm volatile( "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 67cc26b80..f4b564277 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -747,9 +747,9 @@ void ScaleUVLinearUp2_16(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2; +#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif @@ -800,9 +800,9 @@ void ScaleUVBilinearUp2_16(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2; +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif