// Review summary of this patch. These lines sit ahead of the first
// "diff --git" header and are not part of any hunk.
//
// include/libyuv/row.h:
//   HAS_ARGBTOUVMATRIXROW_NEON is now defined only when
//   !defined(__GNUC__) || defined(__clang__), so plain GCC builds no longer
//   get it. NOTE(review): the patch does not say why GCC is excluded -
//   confirm the reason and record it next to the guard.
// source/convert.cc, source/convert_from_argb.cc:
//   at each dispatch site ARGBToUVMatrixRow_SVE2 / ARGBToUVMatrixRow_SME is
//   assigned only when IS_ALIGNED(width, 2) holds; otherwise the assignment
//   is skipped and ARGBToUVMatrixRow keeps its earlier value.
// source/row_any.cc:
//   two wrapper macros now memset() vout as well, mirroring the existing
//   vin memset that is marked "for msan".
// source/row_neon.cc (ARGBToUVMatrixRow_NEON):
//   src_argb + src_stride_argb is computed in C as src_argb_1 and passed as
//   operand %1, replacing the "add %1, %0, %1" inside the asm; each
//   vsub.u16 + vqshrn.u16 #8 pair becomes one vsubhn.s16 (subtract, keep the
//   high half of each 16-bit lane).
// unit_test/convert_argb_test.cc:
//   adds TestARGBToUVMatrixRow_Opt, which runs ARGBToUVMatrixRow_C and
//   ARGBToUVMatrixRow_Any_NEON for every width 1..128 with a source stride
//   of 0 and of kMaxWidth * 4, and compares the first (width + 1) / 2 U and
//   V bytes. NOTE(review): the dest_* buffers are declared outside the
//   HAS_ARGBTOUVMATRIXROW_NEON guard but only used inside it - check that
//   builds without that macro do not warn about unused variables.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 40272cf5a..1efb6ccc8 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -433,7 +433,9 @@ extern "C" { #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJ444ROW_NEON #define HAS_ARGBTOUVJROW_NEON +#if !defined(__GNUC__) || defined(__clang__) #define HAS_ARGBTOUVMATRIXROW_NEON +#endif #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #if !defined(__aarch64__) diff --git a/source/convert.cc b/source/convert.cc index d9fb54778..c64042691 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -2263,12 +2263,16 @@ ARGBToUVMatrixRow_C; #endif #if defined(HAS_ARGBTOUVMATRIXROW_SVE2) if (TestCpuFlag(kCpuHasSVE2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; + } } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SME) if (TestCpuFlag(kCpuHasSME)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; + } } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 2c66611e6..db45f4794 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -592,12 +592,16 @@ ARGBToUVMatrixRow_C; #endif #if defined(HAS_ARGBTOUVMATRIXROW_SVE2) if (TestCpuFlag(kCpuHasSVE2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; + } } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SME) if (TestCpuFlag(kCpuHasSME)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; + } } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) @@ -959,12 +963,16 @@ ARGBToUVMatrixRow_C; #endif #if defined(HAS_ARGBTOUVMATRIXROW_SVE2) if (TestCpuFlag(kCpuHasSVE2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; 
+ } } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SME) if (TestCpuFlag(kCpuHasSME)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; + } } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) @@ -4289,12 +4297,16 @@ int RAWToNV21Matrix(const uint8_t* src_raw, #endif #if defined(HAS_ARGBTOUVMATRIXROW_SVE2) if (TestCpuFlag(kCpuHasSVE2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; + } } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SME) if (TestCpuFlag(kCpuHasSME)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; + } } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) diff --git a/source/row_any.cc b/source/row_any.cc index 82a4abe8d..81e0f44fb 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2201,6 +2201,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) SIMD_ALIGNED(uint8_t vin[256 * 2]); \ SIMD_ALIGNED(uint8_t vout[256 * 2]); \ memset(vin, 0, sizeof(vin)); /* for msan */ \ + memset(vout, 0, sizeof(vout)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ @@ -2244,6 +2245,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) SIMD_ALIGNED(uint8_t vin[256 * 2]); \ SIMD_ALIGNED(uint8_t vout[256 * 2]); \ memset(vin, 0, sizeof(vin)); /* for msan */ \ + memset(vout, 0, sizeof(vout)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ diff --git a/source/row_neon.cc b/source/row_neon.cc index 895e6f113..d893dd7e3 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1924,8 +1924,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb "vld1.8 {d18}, [%5] \n" // load kRGBToU "vld1.8 {d19}, [%6] \n" 
// load kRGBToV "vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17) @@ -1936,6 +1936,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "vdup.16 q13, d18[0] \n" // V0 "vdup.16 q14, d18[1] \n" // V1 "vdup.16 q15, d18[2] \n" // V2 + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -1963,17 +1964,14 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "vmla.s16 q9, q1, q14 \n" // V += G * V1 "vmla.s16 q9, q2, q15 \n" // V += R * V2 - "vsub.u16 q8, q3, q8 \n" // 128.0 - U - "vsub.u16 q9, q3, q9 \n" // 128.0 - V - - "vqshrn.u16 d0, q8, #8 \n" // Saturating shift right - "vqshrn.u16 d1, q9, #8 \n" + "vsubhn.s16 d0, q3, q8 \n" // (128.0 - U) >> 8, high half kept + "vsubhn.s16 d1, q3, q9 \n" // (128.0 - V) >> 8, high half kept "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 + "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index 177f3a669..e309b38bb 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -2809,6 +2809,46 @@ TEST_F(LibYUVConvertTest, TestARGBToUVRow) { } #endif +#ifdef ENABLE_ROW_TESTS +TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) { + const int kMaxWidth = 128; + SIMD_ALIGNED(uint8_t orig_argb_pixels[kMaxWidth * 4 * 2]); + SIMD_ALIGNED(uint8_t dest_u_c[kMaxWidth]); + SIMD_ALIGNED(uint8_t dest_v_c[kMaxWidth]); + SIMD_ALIGNED(uint8_t dest_u_opt[kMaxWidth]); + SIMD_ALIGNED(uint8_t dest_v_opt[kMaxWidth]); + + for (int i = 0; i < kMaxWidth * 4 * 2; ++i) { + orig_argb_pixels[i] = i * 43; + } + +#if defined(HAS_ARGBTOUVMATRIXROW_NEON) + int has_neon = TestCpuFlag(kCpuHasNEON); + if (has_neon) { + for (int width = 1; width <= kMaxWidth; ++width) { + for (int height = 1; height <= 2; ++height) { + memset(dest_u_c, 0, sizeof(dest_u_c)); + memset(dest_v_c, 0, 
sizeof(dest_v_c)); + memset(dest_u_opt, 0, sizeof(dest_u_opt)); + memset(dest_v_opt, 0, sizeof(dest_v_opt)); + + int src_stride = (height == 1) ? 0 : kMaxWidth * 4; + + ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0], &dest_v_c[0], width, &kArgbI601Constants); + ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride, &dest_u_opt[0], &dest_v_opt[0], width, &kArgbI601Constants); + + int half_width = (width + 1) / 2; + for (int i = 0; i < half_width; ++i) { + EXPECT_EQ(dest_u_c[i], dest_u_opt[i]) << "u mismatch at " << i << " width " << width << " height " << height; + EXPECT_EQ(dest_v_c[i], dest_v_opt[i]) << "v mismatch at " << i << " width " << width << " height " << height; + } + } + } + } +#endif +} +#endif + #if !defined(DISABLE_SLOW_TESTS) && \ (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__)) // TODO(fbarchard): Consider _set_new_mode(0) to make malloc return NULL