Scale by even factor low level row function

Bug: b/171884264
Change-Id: I6a94bde0aa05e681bb4590ea8beec33a61ddbfc9
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2518361
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Frank Barchard 2020-11-03 11:25:56 -08:00 committed by Commit Bot
parent f014dbd87a
commit b7a1c5ee5d
17 changed files with 7698 additions and 7554 deletions
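
The diff below adds a NEON fast path for scaling an interleaved UV plane down by an even integer factor: a HAS_ define, ARM32 and AArch64 row kernels, an Any-width adapter, dispatch in ScaleUVDownEven, and two plane tests (the remaining hunks are clang-format comment rewraps). For orientation, here is a minimal scalar model of the row operation — a sketch following libyuv's usual _C fallback shape; the actual ScaleUVRowDownEven_C body is not part of this diff:

    #include <stddef.h>
    #include <stdint.h>

    // Each UV pixel is a 2-byte pair, so a row can be treated as uint16_t
    // (assumes 2-byte alignment). dst takes every src_stepx-th source pixel.
    void ScaleUVRowDownEven_Model(const uint8_t* src_uv,
                                  ptrdiff_t src_stride,  // unused: one row
                                  int src_stepx,
                                  uint8_t* dst_uv,
                                  int dst_width) {
      const uint16_t* src = (const uint16_t*)src_uv;
      uint16_t* dst = (uint16_t*)dst_uv;
      (void)src_stride;
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = src[x * src_stepx];
      }
    }

The NEON kernels vectorize exactly this, gathering four such pixels per iteration.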


@@ -113,6 +113,7 @@ extern "C" {
 #define HAS_SCALEROWDOWN38_NEON
 #define HAS_SCALEROWDOWN4_NEON
 #define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
 #endif
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)


@@ -266,8 +266,10 @@ void TransposeUVWx8_NEON(const uint8_t* src,
       "vst1.8 {d21}, [%0] \n"
       "add %1, #8*2 \n" // src += 8*2
-      "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
-      "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
+      "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
+                                  // dst_stride_a
+      "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
+                                  // dst_stride_b
       "subs %7, #8 \n" // w -= 8
       "bge 1b \n"


@@ -6937,7 +6937,8 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
       "vmovdqu %5, %%ymm0 \n" // init blend value
       "vmovdqu %6, %%ymm1 \n" // init blend value
       "vmovdqu %7, %%ymm2 \n" // init blend value
-      // "sub $0x20, %3 \n" //sub 32 from width for final loop
+      // "sub $0x20, %3 \n" //sub 32 from
+      // width for final loop
      LABELALIGN
       "1: \n" // label 1


@@ -906,7 +906,8 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
   asm volatile(
       "movi v4.8b, #255 \n" // Alpha
       "1: \n"
-      "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
+      "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
+                                              // RGB24.
       "prfm pldl1keep, [%0, 448] \n"
       "subs %w2, %w2, #8 \n" // 8 processed per loop.
       "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
@@ -1106,7 +1107,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
       "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
       "prfm pldl1keep, [%0, 448] \n"
       "subs %w2, %w2, #8 \n" // 8 processed per loop.
-      "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24
+      "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
+                                              // RGB24
       "b.gt 1b \n"
       : "+r"(src_argb),   // %0
         "+r"(dst_rgb24),  // %1
@@ -1339,7 +1341,8 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                           int width) {
   asm volatile(
       "1: \n"
-      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+                                                        // pixels
       "prfm pldl1keep, [%0, 448] \n"
       "subs %w2, %w2, #8 \n" // 8 processed per loop.
       ARGBTORGB565
@@ -1359,7 +1362,8 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
   asm volatile(
       "dup v1.4s, %w2 \n" // dither4
       "1: \n"
-      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
+      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8
+                                                        // pixels
       "prfm pldl1keep, [%0, 448] \n"
       "subs %w3, %w3, #8 \n" // 8 processed per loop.
       "uqadd v20.8b, v20.8b, v1.8b \n"
@@ -1379,7 +1383,8 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                             int width) {
   asm volatile(
       "1: \n"
-      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+                                                        // pixels
       "prfm pldl1keep, [%0, 448] \n"
       "subs %w2, %w2, #8 \n" // 8 processed per loop.
       ARGBTOARGB1555
@@ -1399,7 +1404,8 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
       "movi v4.16b, #0x0f \n" // bits to clear with
                               // vbic.
       "1: \n"
-      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+                                                        // pixels
       "prfm pldl1keep, [%0, 448] \n"
       "subs %w2, %w2, #8 \n" // 8 processed per loop.
       ARGBTOARGB4444
@@ -2375,8 +2381,10 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
       // Blend 1 pixels.
       "1: \n"
-      "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
-      "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
+      "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
+                                                  // ARGB0.
+      "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
+                                                  // ARGB1.
       "prfm pldl1keep, [%0, 448] \n"
       "prfm pldl1keep, [%1, 448] \n"
       "subs %w3, %w3, #1 \n" // 1 processed per loop.
@@ -3183,7 +3191,8 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) {
       "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
       "1: \n"
-      "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4\n" // load 12 samples, 5 rows
+      "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
+                                               // rows
       "fadd v0.4s, v0.4s, v1.4s \n" // * 1
       "ld1 {v4.4s, v5.4s}, [%0], %5 \n"
       "fadd v1.4s, v1.4s, v2.4s \n"


@@ -490,6 +490,13 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
        4,
        1)
 #endif
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+       ScaleUVRowDownEven_NEON,
+       ScaleUVRowDownEven_C,
+       2,
+       3)
+#endif
 #ifdef SASIMDONLY
 // This also works and uses memcpy and SIMD instead of C, but is slower on ARM
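
SDAANY pairs the NEON kernel with the C fallback so any width works: SIMD covers the largest multiple of MASK + 1, and C finishes the tail. Roughly what the macro expands to here, with BPP = 2 (bytes per UV pixel) and MASK = 3 — a sketch of the pattern, not the verbatim macro:

    #include <stddef.h>
    #include <stdint.h>

    void ScaleUVRowDownEven_NEON(const uint8_t*, ptrdiff_t, int, uint8_t*, int);
    void ScaleUVRowDownEven_C(const uint8_t*, ptrdiff_t, int, uint8_t*, int);

    void ScaleUVRowDownEven_Any_NEON_Sketch(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            int src_stepx,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
      int n = dst_width & ~3;  // SIMD-friendly multiple of 4
      if (n > 0) {
        ScaleUVRowDownEven_NEON(src_ptr, src_stride, src_stepx, dst_ptr, n);
      }
      // C tail: source has advanced n steps of src_stepx pixels, 2 bytes each.
      ScaleUVRowDownEven_C(src_ptr + n * src_stepx * 2, src_stride, src_stepx,
                           dst_ptr + n * 2, dst_width & 3);
    }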


@@ -1323,7 +1323,8 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
       "packuswb %%xmm0,%%xmm0 \n"
       "movd %%xmm0,(%0) \n"
-      LABELALIGN "99: \n" // clang-format error.
+      LABELALIGN
+      "99: \n" // clang-format error.
       : "+r"(dst_argb),  // %0
         "+r"(src_argb),  // %1


@@ -979,6 +979,35 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
       : "memory", "cc", "q0", "q1", "q8", "q9");
 }
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             int src_stepx,  // pixel step
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+  (void)src_stride;
+  asm volatile(
+      "1: \n"
+      "vld1.16 {d0[0]}, [%0], %6 \n"
+      "vld1.16 {d0[1]}, [%1], %6 \n"
+      "vld1.16 {d0[2]}, [%2], %6 \n"
+      "vld1.16 {d0[3]}, [%3], %6 \n"
+      "subs %5, %5, #4 \n" // 4 pixels per loop.
+      "vst1.8 {d0}, [%4]! \n"
+      "bgt 1b \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(src1_ptr),  // %1
+        "+r"(src2_ptr),  // %2
+        "+r"(src3_ptr),  // %3
+        "+r"(dst_ptr),   // %4
+        "+r"(dst_width)  // %5
+      : "r"(src_stepx * 8)  // %6
+      : "memory", "cc", "d0");
+}
 
 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus


@@ -1115,6 +1115,35 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
       : "memory", "cc", "v0", "v1", "v16", "v17");
 }
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             int src_stepx,  // pixel step
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+  (void)src_stride;
+  asm volatile(
+      "1: \n"
+      "ld1 {v0.h}[0], [%0], %6 \n"
+      "ld1 {v1.h}[0], [%1], %6 \n"
+      "ld1 {v2.h}[0], [%2], %6 \n"
+      "ld1 {v3.h}[0], [%3], %6 \n"
+      "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+      "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+      "b.gt 1b \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(src1_ptr),  // %1
+        "+r"(src2_ptr),  // %2
+        "+r"(src3_ptr),  // %3
+        "+r"(dst_ptr),   // %4
+        "+r"(dst_width)  // %5
+      : "r"((int64_t)(src_stepx * 8))  // %6
+      : "memory", "cc", "v0", "v1", "v2", "v3");
+}
 
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
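
Both kernels use the same addressing trick: four pointers staggered by one step apiece (src_stepx pixels = src_stepx * 2 bytes), so each iteration gathers four output pixels, after which every pointer post-increments by src_stepx * 8 bytes (four steps). The (int64_t) cast on the AArch64 step is needed because a post-index register offset there must be a 64-bit X register. A small host-side check of the pointer arithmetic (hypothetical harness, not part of the CL):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main() {
      uint8_t src[64], dst[16];  // 32 UV pixels in, 8 out
      for (int i = 0; i < 64; ++i) src[i] = (uint8_t)i;

      const int stepx = 2;  // 1/2 downscale: keep every other UV pair
      const uint8_t* p0 = src;
      const uint8_t* p1 = src + stepx * 2;  // +1 step (2 bytes per pixel)
      const uint8_t* p2 = src + stepx * 4;  // +2 steps
      const uint8_t* p3 = src + stepx * 6;  // +3 steps
      for (int x = 0; x < 8; x += 4) {  // 4 output pixels per iteration
        memcpy(dst + x * 2 + 0, p0, 2);
        memcpy(dst + x * 2 + 2, p1, 2);
        memcpy(dst + x * 2 + 4, p2, 2);
        memcpy(dst + x * 2 + 6, p3, 2);
        p0 += stepx * 8;  // all four pointers advance four steps
        p1 += stepx * 8;
        p2 += stepx * 8;
        p3 += stepx * 8;
      }
      for (int x = 0; x < 8; ++x) {  // matches dst[x] = src[x * stepx]
        assert(dst[x * 2 + 0] == src[x * stepx * 2 + 0]);
        assert(dst[x * 2 + 1] == src[x * stepx * 2 + 1]);
      }
      return 0;
    }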


@@ -299,6 +299,14 @@ static void ScaleUVDownEven(int src_width,
   }
 #endif
 #if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && !filtering) {
+    ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+    }
+  }
+#endif  // TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
                                    : ScaleUVRowDownEven_Any_NEON;
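
The new kernels are dispatched only for unfiltered scales; the TODO leaves box filtering on the existing ScaleUVRowDownEvenBox path. The caller derives the pixel step from the 16.16 fixed-point x increment, so an exact 1/2 or 1/4 downscale yields an even integer src_stepx — a sketch of that derivation (locals assumed, following the pattern of the analogous ARGB even-down path):

    int dx = 2 << 16;          // 16.16 step: advance 2 source pixels per output
    int src_stepx = dx >> 16;  // -> 2, an exact even pixel step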


@@ -1052,4 +1052,61 @@ TEST_FACTOR(3, 1, 3, 0)
 #undef TEST_FACTOR
 #undef SX
 #undef DX
+
+TEST_F(LibYUVScaleTest, PlaneTest3x) {
+  const int kSrcStride = 48;
+  const int kDstStride = 16;
+  const int kSize = kSrcStride * 3;
+  align_buffer_page_end(orig_pixels, kSize);
+  for (int i = 0; i < 48 * 3; ++i) {
+    orig_pixels[i] = i;
+  }
+  align_buffer_page_end(dest_pixels, kDstStride);
+
+  int iterations16 =
+      benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+  for (int i = 0; i < iterations16; ++i) {
+    ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+               kFilterBilinear);
+  }
+
+  EXPECT_EQ(49, dest_pixels[0]);
+
+  ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
+             kFilterNone);
+
+  EXPECT_EQ(49, dest_pixels[0]);
+
+  free_aligned_buffer_page_end(dest_pixels);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTest4x) {
+  const int kSrcStride = 64;
+  const int kDstStride = 16;
+  const int kSize = kSrcStride * 4;
+  align_buffer_page_end(orig_pixels, kSize);
+  for (int i = 0; i < 64 * 4; ++i) {
+    orig_pixels[i] = i;
+  }
+  align_buffer_page_end(dest_pixels, kDstStride);
+
+  int iterations16 =
+      benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
+  for (int i = 0; i < iterations16; ++i) {
+    ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+               kFilterBilinear);
+  }
+
+  EXPECT_EQ((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0]);
+
+  ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+             kFilterNone);
+
+  EXPECT_EQ(130, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
+
+  free_aligned_buffer_page_end(dest_pixels);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
 } // namespace libyuv
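
For reference, the expected values in the two tests above follow from the fill orig_pixels[i] = i, i.e. the source pixel at (row, col) holds row * stride + col:

    // PlaneTest3x, kFilterNone:     dst[0] samples src(1, 1) -> 1 * 48 + 1 = 49
    // PlaneTest4x, kFilterNone:     dst[0] samples src(2, 2) -> 2 * 64 + 2 = 130
    //   (the "3rd pixel of the 3rd row": zero-indexed row 2, col 2)
    // PlaneTest4x, kFilterBilinear: dst[0] is the rounded average of src(1,1),
    //   (1,2), (2,1), (2,2): (65 + 66 + 129 + 130 + 2) / 4 = 98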