ARGBAttenuate: use (a * b + 255) >> 8

- Makes ARM and Intel match and fixes some off-by-1 cases
- Add ARGBToUV444MatrixRow_NEON
- Add ConvertFP16ToFP32Column_NEON
- scale_rvv: fix intrinsic build error
- Disable row_win version of ARGBAttenuate/Unattenuate

Bug: libyuv:936, libyuv:956
Change-Id: Ied99aaad3a11a8eb69212b628c58f86ec0723c38
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4617013
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
2025-12-07 01:06:46 +08:00 · 2023-06-16 14:13:54 -07:00 · 2023-06-16 14:13:54 -07:00 · a366ad714a
commit a366ad714a
parent 04821d1e7d
9 changed files with 388 additions and 175 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1872
+Version: 1873
 License: BSD
 License File: LICENSE
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -161,7 +161,6 @@ extern "C" {
 #define HAS_ARGBSEPIAROW_SSSE3
 #define HAS_ARGBSHADEROW_SSE2
 #define HAS_ARGBSUBTRACTROW_SSE2
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_BLENDPLANEROW_SSSE3
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@ -171,9 +170,6 @@ extern "C" {
 #define HAS_SOBELXROW_SSE2
 #define HAS_SOBELXYROW_SSE2
 #define HAS_SOBELYROW_SSE2
 #if !defined(LIBYUV_BIT_EXACT)
 #define HAS_ARGBATTENUATEROW_SSSE3
 #endif
 // The following functions fail on gcc/clang 32 bit with fpic and framepointer.
 // caveat: clangcl uses row_win.cc which works.
@ -241,11 +237,7 @@ extern "C" {
 #define HAS_ARGBADDROW_AVX2
 #define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBSUBTRACTROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_BLENDPLANEROW_AVX2
 #if !defined(LIBYUV_BIT_EXACT)
 #define HAS_ARGBATTENUATEROW_AVX2
 #endif
 #if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
    defined(_MSC_VER)
@ -285,14 +277,15 @@ extern "C" {
 #define HAS_ABGRTOAR30ROW_SSSE3
 #define HAS_ABGRTOYJROW_SSSE3
 #define HAS_AR64TOARGBROW_SSSE3
 #define HAS_ARGBATTENUATEROW_SSSE3
 #define HAS_ARGBTOAB64ROW_SSSE3
 #define HAS_ARGBTOAR30ROW_SSSE3
 #define HAS_ARGBTOAR64ROW_SSSE3
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
 #define HAS_DETILEROW_SSE2
 #define HAS_DETILEROW_16_SSE2
-#define HAS_DETILEROW_16_AVX
+#define HAS_DETILEROW_SSE2
 #define HAS_DETILESPLITUVROW_SSSE3
 #define HAS_DETILETOYUY2_SSE2
 #define HAS_HALFMERGEUVROW_SSSE3
@ -345,13 +338,16 @@ extern "C" {
 #define HAS_ABGRTOYJROW_AVX2
 #define HAS_ABGRTOYROW_AVX2
 #define HAS_AR64TOARGBROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBTOAB64ROW_AVX2
 #define HAS_ARGBTOAR30ROW_AVX2
 #define HAS_ARGBTOAR64ROW_AVX2
 #define HAS_ARGBTORAWROW_AVX2
 #define HAS_ARGBTORGB24ROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
 #define HAS_DETILEROW_16_AVX
 #define HAS_DIVIDEROW_16_AVX2
 #define HAS_HALFMERGEUVROW_AVX2
 #define HAS_I210TOAR30ROW_AVX2
@ -6190,6 +6186,11 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
 void ConvertFP16ToFP32Row_NEON(const uint16_t* src,  // fp16
                               float* dst,
                               int width);
 // Convert a column of FP16 Half Floats to a row of FP32 Floats
 void ConvertFP16ToFP32Column_NEON(const uint16_t* src,  // fp16
                                  int src_stride,       // stride in elements
                                  float* dst,
                                  int width);
 // Convert FP32 Floats to FP16 Half Floats
 void ConvertFP32ToFP16Row_NEON(const float* src,
                               uint16_t* dst,  // fp16
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1872
+#define LIBYUV_VERSION 1873
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -48,7 +48,6 @@ extern "C" {
                                   defined(__i386__) || defined(_M_IX86))
 #define LIBYUV_ARGBTOUV_PAVGB 1
 #define LIBYUV_RGBTOU_TRUNCATE 1
 #define LIBYUV_ATTENUATE_DUP 1
 #endif
 #if defined(LIBYUV_BIT_EXACT)
 #define LIBYUV_UNATTENUATE_DUP 1
@ -3369,12 +3368,7 @@ void BlendPlaneRow_C(const uint8_t* src0,
 }
 #undef UBLEND
-#if LIBYUV_ATTENUATE_DUP
+#define ATTENUATE(f, a) (f * a + 255) >> 8
 // This code mimics the SSSE3 version for better testability.
 #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
 #else
 #define ATTENUATE(f, a) (f * a + 128) >> 8
 #endif
 // Multiply source RGB by alpha and store to destination.
 void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@ -7441,83 +7441,95 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
+static const vec8 kAttenuateShuffle = {6,    -128, 6,    -128, 6,  -128,
-                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
+                                       -128, -128, 14,   -128, 14, -128,
-static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+                                       14,   -128, -128, -128};
-                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
+
 // Attenuate 4 pixels at a time.
 void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "pslld       $0x18,%%xmm3                  \n"
      "movdqa      %3,%%xmm4                     \n"
-      "movdqa      %4,%%xmm5                     \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"
      "pxor        %%xmm6,%%xmm6                 \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "punpcklbw   %%xmm6,%%xmm7                 \n"
      "sub         %0,%1                         \n"
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
-      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      (%0),%%xmm6                   \n"
-      "pshufb      %%xmm4,%%xmm0                 \n"
+      "movdqa      %%xmm6,%%xmm0                 \n"
-      "movdqu      (%0),%%xmm1                   \n"
+      "movdqa      %%xmm6,%%xmm1                 \n"
-      "punpcklbw   %%xmm1,%%xmm1                 \n"
+      "punpcklbw   %%xmm5,%%xmm0                 \n"
-      "pmulhuw     %%xmm1,%%xmm0                 \n"
+      "punpckhbw   %%xmm5,%%xmm1                 \n"
-      "movdqu      (%0),%%xmm1                   \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
-      "pshufb      %%xmm5,%%xmm1                 \n"
+      "movdqa      %%xmm1,%%xmm3                 \n"
-      "movdqu      (%0),%%xmm2                   \n"
+      "pshufb      %%xmm4,%%xmm2                 \n"  // a,a,a,0
-      "punpckhbw   %%xmm2,%%xmm2                 \n"
+      "pshufb      %%xmm4,%%xmm3                 \n"
-      "pmulhuw     %%xmm2,%%xmm1                 \n"
+      "pmullw      %%xmm2,%%xmm0                 \n"  // rgb * alpha
-      "movdqu      (%0),%%xmm2                   \n"
+      "pmullw      %%xmm3,%%xmm1                 \n"
-      "lea         0x10(%0),%0                   \n"
+      "paddw       %%xmm7,%%xmm0                 \n"  // + 255
-      "pand        %%xmm3,%%xmm2                 \n"
+      "paddw       %%xmm7,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
-      "por         %%xmm2,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm6                 \n"
-      "movdqu      %%xmm0,(%1)                   \n"
+      "por         %%xmm6,%%xmm0                 \n"
-      "lea         0x10(%1),%1                   \n"
+      "movdqu      %%xmm0,(%0,%1)                \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),         // %0
        "+r"(dst_argb),         // %1
        "+r"(width)             // %2
-      : "m"(kShuffleAlpha0),  // %3
+      : "m"(kAttenuateShuffle)  // %3
-        "m"(kShuffleAlpha1)   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+        "xmm7");
 }
 #endif  // HAS_ARGBATTENUATEROW_SSSE3
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
+static const lvec8 kAttenuateShuffle_AVX2 = {
-                                         128u, 128u, 14u,  15u, 14u, 15u,
+    6,    -128, 6,    -128, 6,    -128, -128, -128, 14,   -128, 14,
-                                         14u,  15u,  128u, 128u};
+    -128, 14,   -128, -128, -128, 22,   -128, 22,   -128, 22,   -128,
    -128, -128, 30,   -128, 30,   -128, 30,   -128, -128, -128};
 // Attenuate 8 pixels at a time.
 void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
-      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vmovdqa     %3,%%ymm4                     \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpslld      $0x18,%%ymm5,%%ymm5           \n"
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"
      "vpunpcklbw  %%ymm6,%%ymm7,%%ymm7          \n"
      "sub         %0,%1                         \n"
      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm6                   \n"
-      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
+      "vpunpcklbw  %%ymm5,%%ymm6,%%ymm0          \n"
-      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
+      "vpunpckhbw  %%ymm5,%%ymm6,%%ymm1          \n"
      "vpshufb     %%ymm4,%%ymm0,%%ymm2          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm3          \n"
-      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpmullw     %%ymm2,%%ymm0,%%ymm0          \n"
-      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
-      "vpand       %%ymm5,%%ymm6,%%ymm6          \n"
+      "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm7,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
-      "vpor        %%ymm6,%%ymm0,%%ymm0          \n"
+      "vpand       %%ymm5,%%ymm6,%%ymm1          \n"
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
@ -7526,8 +7538,9 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
      : "+r"(src_argb),              // %0
        "+r"(dst_argb),              // %1
        "+r"(width)                  // %2
-      : "m"(kShuffleAlpha_AVX2)  // %3
+      : "m"(kAttenuateShuffle_AVX2)  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
 }
 #endif  // HAS_ARGBATTENUATEROW_AVX2
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -1827,19 +1827,27 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
  );
 }
 struct RgbUVConstants {
  uint8_t kRGBToU[4];
  uint8_t kRGBToV[4];
 };
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
-                         int width) {
+                               int width,
                               const struct RgbUVConstants* rgbuvconstants) {
  asm volatile(
-      "vmov.u8     d24, #112                     \n"  // UB / VR 0.875
+
-                                                      // coefficient
+      "vld1.8      {d0}, [%4]                    \n"  // load rgbuvconstants
-      "vmov.u8     d25, #74                      \n"  // UG -0.5781 coefficient
+      "vdup.u8     d24, d0[0]                    \n"  // UB  0.875  coefficient
-      "vmov.u8     d26, #38                      \n"  // UR -0.2969 coefficient
+      "vdup.u8     d25, d0[1]                    \n"  // UG -0.5781 coefficient
-      "vmov.u8     d27, #18                      \n"  // VB -0.1406 coefficient
+      "vdup.u8     d26, d0[2]                    \n"  // UR -0.2969 coefficient
-      "vmov.u8     d28, #94                      \n"  // VG -0.7344 coefficient
+      "vdup.u8     d27, d0[4]                    \n"  // VB -0.1406 coefficient
      "vdup.u8     d28, d0[5]                    \n"  // VG -0.7344 coefficient
      "vmov.u16    q15, #0x8080                  \n"  // 128.5
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
@ -1861,11 +1869,49 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
        "+r"(dst_u),         // %1
        "+r"(dst_v),         // %2
        "+r"(width)          // %3
-      :
+      : "r"(rgbuvconstants)  // %4
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
        "q15");
 }
 // RGB to bt601 coefficients
 // UB   0.875 coefficient = 112
 // UG -0.5781 coefficient = 74
 // UR -0.2969 coefficient = 38
 // VB -0.1406 coefficient = 18
 // VG -0.7344 coefficient = 94
 // VR   0.875 coefficient = 112 (ignored)
 static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
                                                            {18, 94, 112, 0}};
 // RGB to JPeg coefficients
 // UB coeff 0.500    = 127
 // UG coeff -0.33126 = 84
 // UR coeff -0.16874 = 43
 // VB coeff -0.08131 = 20
 // VG coeff -0.41869 = 107
 // VR coeff 0.500    = 127 (ignored)
 static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
                                                            {20, 107, 127, 0}};
 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
                            &kRgb24I601UVConstants);
 }
 void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
                            &kRgb24JPegUVConstants);
 }
 // clang-format off
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
 #define RGBTOUV(QB, QG, QR)                                                 \
@ -2702,7 +2748,6 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
 struct RgbConstants {
  uint8_t kRGBToY[4];
  uint16_t kAddY;
  uint16_t pad;
 };
 // RGB to JPeg coefficients
@ -2710,11 +2755,9 @@ struct RgbConstants {
 // G * 0.5870 coefficient = 150
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
-static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
                                                        128,
                                                        0};
-static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@ -2723,12 +2766,9 @@ static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
 // Add 16.5 = 0x1080
 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                        0x1080,
+                                                        0x1080};
                                                        0};
-static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
                                                      0x1080,
                                                      0};
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
@ -3058,6 +3098,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "vmov.u16    q15, #0x00ff                  \n"  // 255 for rounding up
      // Attenuate 8 pixels.
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 pixels of ARGB.
@ -3065,16 +3107,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
      "vmull.u8    q10, d0, d3                   \n"  // b * a
      "vmull.u8    q11, d1, d3                   \n"  // g * a
      "vmull.u8    q12, d2, d3                   \n"  // r * a
-      "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
+      "vaddhn.u16  d0, q10, q15                  \n"  // (b + 255) >> 8
-      "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
+      "vaddhn.u16  d1, q11, q15                  \n"  // (g + 255) >> 8
-      "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+      "vaddhn.u16  d2, q12, q15                  \n"  // (r + 255) >> 8
      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
-      : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
+      : "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15");
 }
 // Quantize 8 ARGB pixels (32 bytes).
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2198,19 +2198,26 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
  );
 }
 struct RgbUVConstants {
  uint8_t kRGBToU[4];
  uint8_t kRGBToV[4];
 };
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
-                         int width) {
+                               int width,
                               const struct RgbUVConstants* rgbuvconstants) {
  asm volatile(
-      "movi        v24.8b, #112                  \n"  // UB / VR 0.875
+      "ldr         d0, [%4]                      \n"  // load rgbuvconstants
-                                                      // coefficient
+      "dup         v24.16b, v0.b[0]              \n"  // UB  0.875 coefficient
-      "movi        v25.8b, #74                   \n"  // UG -0.5781 coefficient
+      "dup         v25.16b, v0.b[1]              \n"  // UG -0.5781 coefficient
-      "movi        v26.8b, #38                   \n"  // UR -0.2969 coefficient
+      "dup         v26.16b, v0.b[2]              \n"  // UR -0.2969 coefficient
-      "movi        v27.8b, #18                   \n"  // VB -0.1406 coefficient
+      "dup         v27.16b, v0.b[4]              \n"  // VB -0.1406 coefficient
-      "movi        v28.8b, #94                   \n"  // VG -0.7344 coefficient
+      "dup         v28.16b, v0.b[5]              \n"  // VG -0.7344 coefficient
      "movi        v29.16b, #0x80                \n"  // 128.5
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
@ -2233,11 +2240,49 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
        "+r"(dst_u),         // %1
        "+r"(dst_v),         // %2
        "+r"(width)          // %3
-      :
+      : "r"(rgbuvconstants)  // %4
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
        "v27", "v28", "v29");
 }
 // RGB to bt601 coefficients
 // UB   0.875 coefficient = 112
 // UG -0.5781 coefficient = 74
 // UR -0.2969 coefficient = 38
 // VB -0.1406 coefficient = 18
 // VG -0.7344 coefficient = 94
 // VR   0.875 coefficient = 112 (ignored)
 static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
                                                            {18, 94, 112, 0}};
 // RGB to JPeg coefficients
 // UB coeff 0.500    = 127
 // UG coeff -0.33126 = 84
 // UR coeff -0.16874 = 43
 // VB coeff -0.08131 = 20
 // VG coeff -0.41869 = 107
 // VR coeff 0.500    = 127 (ignored)
 static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
                                                            {20, 107, 127, 0}};
 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
                            &kRgb24I601UVConstants);
 }
 void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
                            &kRgb24JPegUVConstants);
 }
 #define RGBTOUV_SETUP_REG                                                  \
  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
@ -2943,34 +2988,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
 struct RgbConstants {
  uint8_t kRGBToY[4];
  uint16_t kAddY;
  uint16_t pad;
 };
 // RGB to JPeg coefficients
 // B * 0.1140 coefficient = 29
 // G * 0.5870 coefficient = 150
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
 static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
                                                        128,
                                                        0};
 static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
 // G * 0.5078 coefficient = 129
 // R * 0.2578 coefficient = 66
 // Add 16.5 = 0x1080
 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
                                                        0x1080,
                                                        0};
 static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
                                                      0x1080,
                                                      0};
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_y,
@ -3005,6 +3024,26 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
        "v17");
 }
 // RGB to JPeg coefficients
 // B * 0.1140 coefficient = 29
 // G * 0.5870 coefficient = 150
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
 static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
 static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
 // G * 0.5078 coefficient = 129
 // R * 0.2578 coefficient = 66
 // Add 16.5 = 0x1080
 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
                                                        0x1080};
 static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
 }
@ -3402,6 +3441,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "movi        v7.8h, #0x00ff                \n"  // 255 for rounding up
      // Attenuate 8 pixels.
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
@ -3410,16 +3451,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
      "prfm        pldl1keep, [%0, 448]          \n"
      "umull       v5.8h, v1.8b, v3.8b           \n"  // g * a
      "umull       v6.8h, v2.8b, v3.8b           \n"  // r * a
-      "uqrshrn     v0.8b, v4.8h, #8              \n"         // b >>= 8
+      "addhn       v0.8b, v4.8h, v7.8h           \n"         // (b + 255) >> 8
-      "uqrshrn     v1.8b, v5.8h, #8              \n"         // g >>= 8
+      "addhn       v1.8b, v5.8h, v7.8h           \n"         // (g + 255) >> 8
-      "uqrshrn     v2.8b, v6.8h, #8              \n"         // r >>= 8
+      "addhn       v2.8b, v6.8h, v7.8h           \n"         // (r + 255) >> 8
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 // Quantize 8 ARGB pixels (32 bytes).
@ -3980,6 +4021,46 @@ void ConvertFP16ToFP32Row_NEON(const uint16_t* src,  // fp16
      : "cc", "memory", "v1", "v2", "v3");
 }
 // Convert FP16 Half Floats to FP32 Floats
 // Read a column and write a row
 void ConvertFP16ToFP32Column_NEON(const uint16_t* src,  // fp16
                                  int src_stride,       // stride in elements
                                  float* dst,
                                  int width) {
  asm volatile(
      "cmp         %w2, #8                       \n"  // Is there 8 rows?
      "b.lo        2f                            \n"
      "1:                                        \n"
      "ld1         {v0.h}[0], [%0], %3           \n"  // load 8 halffloats
      "ld1         {v0.h}[1], [%0], %3           \n"
      "ld1         {v0.h}[2], [%0], %3           \n"
      "ld1         {v0.h}[3], [%0], %3           \n"
      "ld1         {v1.h}[0], [%0], %3           \n"
      "ld1         {v1.h}[1], [%0], %3           \n"
      "ld1         {v1.h}[2], [%0], %3           \n"
      "ld1         {v1.h}[3], [%0], %3           \n"
      "subs        %w2, %w2, #8                  \n"  // 8 rows per loop
      "prfm        pldl1keep, [%0, 448]          \n"
      "fcvtl       v2.4s, v0.4h                  \n"  // 4 floats
      "fcvtl       v3.4s, v1.4h                  \n"  // 4 more floats
      "stp         q2, q3, [%1], #32             \n"  // store 8 floats
      "b.gt        1b                            \n"
      "cmp         %w2, #1                       \n"  // Is there 1 value?
      "b.lo        3f                            \n"
      "2:                                        \n"
      "ld1         {v1.h}[0], [%0], %3           \n"  // load 1 halffloats
      "subs        %w2, %w2, #1                  \n"  // 1 floats per loop
      "fcvtl       v2.4s, v1.4h                  \n"  // 1 floats
      "str         s2, [%1], #4                  \n"  // store 1 floats
      "b.gt        2b                            \n"
      "3:                                        \n"
      : "+r"(src),                        // %0
        "+r"(dst),                        // %1
        "+r"(width)                       // %2
      : "r"((ptrdiff_t)(src_stride * 2))  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 // Convert FP32 Floats to FP16 Half Floats
 void ConvertFP32ToFP16Row_NEON(const float* src,
                               uint16_t* dst,  // fp16
--- a/source/scale_rvv.cc
+++ b/source/scale_rvv.cc
@ -75,7 +75,6 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_odd, v_even, v_dst;
    vuint16m8_t v_sum;
    vuint32m4_t v_odd_32, v_even_32;
    size_t vl = __riscv_vsetvl_e32m4(w);
    __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl);
@ -499,7 +498,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
    vuint8m4_t v_u0v0, v_u1v1, v_avg;
    vuint16m4_t v_u0v0_16, v_u1v1_16;
    size_t vl = __riscv_vsetvl_e16m4(w);
-    vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl);
+    __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl);
    v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16);
    v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16);
    // Use round-to-nearest-up mode for averaging add
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -30,9 +30,9 @@
 #endif
 #if defined(LIBYUV_BIT_EXACT)
-#define EXPECTED_ATTENUATE_DIFF 0
+#define EXPECTED_UNATTENUATE_DIFF 0
 #else
-#define EXPECTED_ATTENUATE_DIFF 2
+#define EXPECTED_UNATTENUATE_DIFF 2
 #endif
 namespace libyuv {
@ -57,12 +57,17 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
  orig_pixels[2 * 4 + 0] = 16u;
  orig_pixels[2 * 4 + 1] = 64u;
  orig_pixels[2 * 4 + 2] = 192u;
-  orig_pixels[2 * 4 + 3] = 255u;
+  orig_pixels[2 * 4 + 3] = 128u;
  orig_pixels[3 * 4 + 0] = 16u;
  orig_pixels[3 * 4 + 1] = 64u;
  orig_pixels[3 * 4 + 2] = 192u;
-  orig_pixels[3 * 4 + 3] = 128u;
+  orig_pixels[3 * 4 + 3] = 255u;
-  ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1);
+  orig_pixels[4 * 4 + 0] = 255u;
  orig_pixels[4 * 4 + 1] = 255u;
  orig_pixels[4 * 4 + 2] = 255u;
  orig_pixels[4 * 4 + 3] = 255u;
  ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 5, 1);
  EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
  EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
  EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
@ -71,14 +76,55 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
-  EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]);
+  EXPECT_EQ(32u, unatten_pixels[2 * 4 + 0]);
-  EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]);
+  EXPECT_EQ(128u, unatten_pixels[2 * 4 + 1]);
-  EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]);
+  EXPECT_EQ(255u, unatten_pixels[2 * 4 + 2]);
-  EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]);
+  EXPECT_EQ(128u, unatten_pixels[2 * 4 + 3]);
-  EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]);
+  EXPECT_EQ(16u, unatten_pixels[3 * 4 + 0]);
-  EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]);
+  EXPECT_EQ(64u, unatten_pixels[3 * 4 + 1]);
-  EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]);
+  EXPECT_EQ(192u, unatten_pixels[3 * 4 + 2]);
-  EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]);
+  EXPECT_EQ(255u, unatten_pixels[3 * 4 + 3]);
  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 0]);
  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 1]);
  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 2]);
  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 3]);
  ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 5, 1);
  EXPECT_EQ(100u, atten_pixels[0 * 4 + 0]);
  EXPECT_EQ(65u, atten_pixels[0 * 4 + 1]);
  EXPECT_EQ(64u, atten_pixels[0 * 4 + 2]);
  EXPECT_EQ(128u, atten_pixels[0 * 4 + 3]);
  EXPECT_EQ(0u, atten_pixels[1 * 4 + 0]);
  EXPECT_EQ(0u, atten_pixels[1 * 4 + 1]);
  EXPECT_EQ(0u, atten_pixels[1 * 4 + 2]);
  EXPECT_EQ(0u, atten_pixels[1 * 4 + 3]);
  EXPECT_EQ(8u, atten_pixels[2 * 4 + 0]);
  EXPECT_EQ(32u, atten_pixels[2 * 4 + 1]);
  EXPECT_EQ(96u, atten_pixels[2 * 4 + 2]);
  EXPECT_EQ(128u, atten_pixels[2 * 4 + 3]);
  EXPECT_EQ(16u, atten_pixels[3 * 4 + 0]);
  EXPECT_EQ(64u, atten_pixels[3 * 4 + 1]);
  EXPECT_EQ(192u, atten_pixels[3 * 4 + 2]);
  EXPECT_EQ(255u, atten_pixels[3 * 4 + 3]);
  EXPECT_EQ(255u, atten_pixels[4 * 4 + 0]);
  EXPECT_EQ(255u, atten_pixels[4 * 4 + 1]);
  EXPECT_EQ(255u, atten_pixels[4 * 4 + 2]);
  EXPECT_EQ(255u, atten_pixels[4 * 4 + 3]);
  // test 255
  for (int i = 0; i < 256; ++i) {
    orig_pixels[i * 4 + 0] = i;
    orig_pixels[i * 4 + 1] = 0;
    orig_pixels[i * 4 + 2] = 0;
    orig_pixels[i * 4 + 3] = 255;
  }
  ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 256, 1);
  for (int i = 0; i < 256; ++i) {
    EXPECT_EQ(orig_pixels[i * 4 + 0], atten_pixels[i * 4 + 0]);
    EXPECT_EQ(0, atten_pixels[i * 4 + 1]);
    EXPECT_EQ(0, atten_pixels[i * 4 + 2]);
    EXPECT_EQ(255, atten_pixels[i * 4 + 3]);
  }
  for (int i = 0; i < 1280; ++i) {
    orig_pixels[i * 4 + 0] = i;
@ -92,10 +138,10 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
    ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
  }
  for (int i = 0; i < 1280; ++i) {
-    EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 1);
-    EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 1);
-    EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 1);
-    EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 1);
  }
  // Make sure transparent, 50% and opaque are fully accurate.
  EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
@ -106,9 +152,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
  EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
  EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
  EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
-  EXPECT_NEAR(254, atten_pixels[255 * 4 + 0], EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(255, atten_pixels[255 * 4 + 0]);
-  EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(127, atten_pixels[255 * 4 + 1]);
-  EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(85, atten_pixels[255 * 4 + 2]);
  EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
  free_aligned_buffer_page_end(atten2_pixels);
@ -165,28 +211,28 @@ TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
                                benchmark_iterations_, disable_cpu_flags_,
                                benchmark_cpu_info_, +1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(max_diff, 0);
 }
 // Runs TestAttenuateI at the benchmark width/height with trailing arguments
 // (+1, 1). After this change the returned max_diff must be exactly 0; the
 // removed line only bounded it by EXPECTED_ATTENUATE_DIFF.
 // NOTE(review): the trailing arguments are presumably (invert, offset) --
 // confirm against TestAttenuateI, which is not visible in this hunk.
 TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
  int max_diff =
      TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
                     disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(max_diff, 0);
 }
 // Runs TestAttenuateI at the benchmark width/height with trailing arguments
 // (-1, 0). After this change the returned max_diff must be exactly 0; the
 // removed line only bounded it by EXPECTED_ATTENUATE_DIFF.
 // NOTE(review): the -1 is presumably an "invert" (negative height) selector --
 // confirm against TestAttenuateI, which is not visible in this hunk.
 TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
  int max_diff =
      TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
                     disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(max_diff, 0);
 }
 // Runs TestAttenuateI at the benchmark width/height with trailing arguments
 // (+1, 0). After this change the returned max_diff must be exactly 0; the
 // removed line only bounded it by EXPECTED_ATTENUATE_DIFF.
 TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
  int max_diff =
      TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
                     disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_EQ(max_diff, 0);
 }
 static int TestUnattenuateI(int width,
@ -238,28 +284,28 @@ TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
  int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_,
                                  benchmark_iterations_, disable_cpu_flags_,
                                  benchmark_cpu_info_, +1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 // Runs TestUnattenuateI at the benchmark width/height with trailing arguments
 // (+1, 1) and bounds the returned max_diff. This change switches the bound
 // from EXPECTED_ATTENUATE_DIFF to the separate EXPECTED_UNATTENUATE_DIFF
 // macro (its value is defined outside this hunk).
 // NOTE(review): the trailing arguments are presumably (invert, offset) --
 // confirm against TestUnattenuateI.
 TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
  int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                  benchmark_iterations_, disable_cpu_flags_,
                                  benchmark_cpu_info_, +1, 1);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 // Runs TestUnattenuateI at the benchmark width/height with trailing arguments
 // (-1, 0) and bounds the returned max_diff. This change switches the bound
 // from EXPECTED_ATTENUATE_DIFF to the separate EXPECTED_UNATTENUATE_DIFF
 // macro (its value is defined outside this hunk).
 // NOTE(review): the -1 is presumably an "invert" (negative height) selector --
 // confirm against TestUnattenuateI.
 TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
  int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                  benchmark_iterations_, disable_cpu_flags_,
                                  benchmark_cpu_info_, -1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 // Runs TestUnattenuateI at the benchmark width/height with trailing arguments
 // (+1, 0) and bounds the returned max_diff. This change switches the bound
 // from EXPECTED_ATTENUATE_DIFF to the separate EXPECTED_UNATTENUATE_DIFF
 // macro (its value is defined outside this hunk).
 TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
  int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                  benchmark_iterations_, disable_cpu_flags_,
                                  benchmark_cpu_info_, +1, 0);
-  EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
@ -2764,8 +2810,8 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
  }
  opt_time = (get_time() - opt_time) / benchmark_iterations_;
  // Report performance of C vs OPT
-  printf("%8d us C - %8d us OPT\n",
+  printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
-         static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+         static_cast<int>(opt_time * 1e6));
  for (int i = 0; i < kPixels; ++i) {
    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }
@ -2804,8 +2850,8 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
  opt_time = (get_time() - opt_time) / benchmark_iterations_;
  // Report performance of C vs OPT
-  printf("%8d us C - %8d us OPT\n",
+  printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
-         static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+         static_cast<int>(opt_time * 1e6));
  for (int i = 0; i < kPixels * 4; ++i) {
    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }
@ -4531,6 +4577,43 @@ TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) {
  free_aligned_buffer_page_end(rec_opt);
 }
 // Round-trip test for ConvertFP16ToFP32Column_NEON:
 //   float -> FP16 (ConvertFP32ToFP16Row_NEON)
 //         -> float (ConvertFP16ToFP32Column_NEON, the function under test)
 //         -> FP16 (ConvertFP32ToFP16Row_NEON)
 // and expects the final FP16 values to equal the first FP16 values.
 TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32Column) {
  int i, j;
  const int y_plane_size = benchmark_width_ * benchmark_height_;
  // Buffers accessed as float get 4 bytes per element; buffers accessed as
  // uint16_t (FP16 bit patterns) get 2 bytes per element.
  align_buffer_page_end(orig_f, y_plane_size * 4);
  align_buffer_page_end(orig_y, y_plane_size * 2);
  align_buffer_page_end(dst_opt, y_plane_size * 4);
  align_buffer_page_end(rec_opt, y_plane_size * 2);
  // Fill the float source with a repeating ramp: (i % 10000) * 3.14f.
  for (i = 0; i < y_plane_size; ++i) {
    ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f;
  }
  // Pre-fill every output buffer with a different byte value (1, 2, 3).
  // NOTE(review): presumably so an element left unwritten by a conversion is
  // distinguishable from real output -- confirm intent.
  memset(orig_y, 1, y_plane_size * 2);
  memset(dst_opt, 2, y_plane_size * 4);
  memset(rec_opt, 3, y_plane_size * 2);
  // Produce the FP16 reference data from the float source.
  ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
                            y_plane_size);
  // Function under test, repeated benchmark_iterations_ times.
  // NOTE(review): the second argument (1) is presumably the source stride in
  // uint16_t elements, i.e. contiguous input -- confirm against the
  // ConvertFP16ToFP32Column_NEON declaration.
  for (j = 0; j < benchmark_iterations_; j++) {
    ConvertFP16ToFP32Column_NEON((const uint16_t*)orig_y, 1, (float*)dst_opt,
                                 y_plane_size);
  }
  // Convert the widened floats back to FP16 so the comparison below is done on
  // 16-bit values.
  ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
                            y_plane_size);
  for (i = 0; i < y_plane_size; ++i) {
    EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
  }
  free_aligned_buffer_page_end(orig_f);
  free_aligned_buffer_page_end(orig_y);
  free_aligned_buffer_page_end(dst_opt);
  free_aligned_buffer_page_end(rec_opt);
 }
 #endif  // defined(ENABLE_ROW_TESTS) && defined(__aarch64__)
 }  // namespace libyuv