mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
ScaleRowUp2_16_C port of NEON to C
Single-pass 2x upsample with a bilinear filter. The NEON version has been optimized.
Timings on Pixel Sailfish (QC821):
Was: TestScaleRowUp2_16 (5741 ms)
Now: TestScaleRowUp2_16 (4484 ms)
C: TestScaleRowUp2_16 (6555 ms)
TBR=kjellander@chromium.org
BUG=libyuv:718
TEST=LibYUVScaleTest.TestScaleRowUp2_16 (709 ms)
Change-Id: Ib04ceb53e0ab644a392c39c3396e313530161d92
Reviewed-on: https://chromium-review.googlesource.com/646701
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
This commit is contained in:
parent
2621c91bf1
commit
8f5e9cd9eb
@ -1306,6 +1306,35 @@ void ScaleSlope(int src_width,
|
||||
}
|
||||
#undef CENTERSTART
|
||||
|
||||
// Read 8x2 upsample with filtering and write 16x1.
|
||||
// actually reads an extra pixel, so 9x2.
|
||||
void ScaleRowUp2_16_C(const uint16* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16* dst,
|
||||
int dst_width) {
|
||||
const uint16* src2 = src_ptr + src_stride;
|
||||
|
||||
int x;
|
||||
for (x = 0; x < dst_width - 1; x += 2) {
|
||||
uint16 p0 = src_ptr[0];
|
||||
uint16 p1 = src_ptr[1];
|
||||
uint16 p2 = src2[0];
|
||||
uint16 p3 = src2[1];
|
||||
dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
|
||||
dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
|
||||
++src_ptr;
|
||||
++src2;
|
||||
dst += 2;
|
||||
}
|
||||
if (dst_width & 1) {
|
||||
uint16 p0 = src_ptr[0];
|
||||
uint16 p1 = src_ptr[1];
|
||||
uint16 p2 = src2[0];
|
||||
uint16 p3 = src2[1];
|
||||
dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -1010,66 +1010,43 @@ void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
|
||||
}
|
||||
|
||||
// Read 8x2 upsample with filtering and write 16x1.
|
||||
// actually reads an extra pixel, so 9x2.
|
||||
// Actually reads an extra pixel, so 9x2.
|
||||
void ScaleRowUp2_16_NEON(const uint16* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16* dst,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
"add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
|
||||
"movi v20.4h, #1 \n"
|
||||
"movi v21.4h, #3 \n" // constants
|
||||
"movi v22.4h, #9 \n"
|
||||
"movi v0.8h, #9 \n" // constants
|
||||
"movi v1.4s, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
"ld2 {v0.4h, v1.4h}, [%0], %4 \n" // load row 1 even pixels
|
||||
"ld2 {v2.4h, v3.4h}, [%1], %4 \n" // load row 2
|
||||
|
||||
// consider a variation of this for last 8x2 that replicates the last
|
||||
// pixel.
|
||||
"ld2 {v4.4h, v5.4h}, [%0], %5 \n" // load row 1 odd pixels
|
||||
"ld2 {v6.4h, v7.4h}, [%1], %5 \n" // load row 2
|
||||
|
||||
"ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
|
||||
"ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
|
||||
"ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
|
||||
"ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
|
||||
"subs %w3, %w3, #16 \n" // 16 dst pixels per loop
|
||||
|
||||
// filter first 2x2 group to produce 1st and 4th dest pixels
|
||||
// 9 3
|
||||
// 3 1
|
||||
"umull v8.4s, v0.4h, v22.4h \n"
|
||||
"umlal v8.4s, v1.4h, v21.4h \n"
|
||||
"umlal v8.4s, v2.4h, v21.4h \n"
|
||||
"umlal v8.4s, v3.4h, v20.4h \n"
|
||||
|
||||
// filter first 2x2 group to produce 2nd and 5th dest pixel
|
||||
// 3 9
|
||||
// 1 3
|
||||
"umull v9.4s, v0.4h, v21.4h \n"
|
||||
"umlal v9.4s, v1.4h, v22.4h \n"
|
||||
"umlal v9.4s, v2.4h, v20.4h \n"
|
||||
"umlal v9.4s, v3.4h, v21.4h \n"
|
||||
|
||||
// filter second 2x2 group to produce 3rd and 6th dest pixels
|
||||
// 9 3
|
||||
// 3 1
|
||||
"umull v10.4s, v4.4h, v22.4h \n"
|
||||
"umlal v10.4s, v5.4h, v21.4h \n"
|
||||
"umlal v10.4s, v6.4h, v21.4h \n"
|
||||
"umlal v10.4s, v7.4h, v20.4h \n"
|
||||
|
||||
// filter second 2x2 group to produce 4th and 7th dest pixel
|
||||
// 3 9
|
||||
// 1 3
|
||||
"umull v11.4s, v4.4h, v21.4h \n"
|
||||
"umlal v11.4s, v5.4h, v22.4h \n"
|
||||
"umlal v11.4s, v6.4h, v20.4h \n"
|
||||
"umlal v11.4s, v7.4h, v21.4h \n"
|
||||
|
||||
"uqrshrn v8.4h, v8.4s, #4 \n" // downshift, round
|
||||
"uqrshrn v9.4h, v9.4s, #4 \n"
|
||||
"uqrshrn v10.4h, v10.4s, #4 \n"
|
||||
"uqrshrn v11.4h, v11.4s, #4 \n"
|
||||
|
||||
"st4 {v8.4h, v9.4h, v10.4h, v11.4h}, [%2], #32 \n"
|
||||
"umull v16.4s, v3.4h, v0.4h \n"
|
||||
"umull2 v7.4s, v3.8h, v0.8h \n"
|
||||
"umull v18.4s, v4.4h, v0.4h \n"
|
||||
"umull2 v17.4s, v4.8h, v0.8h \n"
|
||||
"uaddw v16.4s, v16.4s, v6.4h \n"
|
||||
"uaddl2 v19.4s, v6.8h, v3.8h \n"
|
||||
"uaddl v3.4s, v6.4h, v3.4h \n"
|
||||
"uaddw2 v6.4s, v7.4s, v6.8h \n"
|
||||
"uaddl2 v7.4s, v5.8h, v4.8h \n"
|
||||
"uaddl v4.4s, v5.4h, v4.4h \n"
|
||||
"uaddw v18.4s, v18.4s, v5.4h \n"
|
||||
"mla v16.4s, v4.4s, v1.4s \n"
|
||||
"mla v18.4s, v3.4s, v1.4s \n"
|
||||
"mla v6.4s, v7.4s, v1.4s \n"
|
||||
"uaddw2 v4.4s, v17.4s, v5.8h \n"
|
||||
"uqrshrn v16.4h, v16.4s, #4 \n"
|
||||
"mla v4.4s, v19.4s, v1.4s \n"
|
||||
"uqrshrn2 v16.8h, v6.4s, #4 \n"
|
||||
"uqrshrn v17.4h, v18.4s, #4 \n"
|
||||
"uqrshrn2 v17.8h, v4.4s, #4 \n"
|
||||
"st2 {v16.8h-v17.8h}, [%2], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(src_stride), // %1
|
||||
@ -1077,9 +1054,8 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
|
||||
"+r"(dst_width) // %3
|
||||
: "r"(2LL), // %4
|
||||
"r"(14LL) // %5
|
||||
|
||||
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
||||
"v11", "v20", "v21", "v22" // Clobber List
|
||||
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
|
||||
"v19" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@ -450,12 +450,14 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) {
|
||||
}
|
||||
#endif // HAS_SCALEROWDOWN2_SSSE3
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
extern "C" void ScaleRowUp2_16_NEON(const uint16* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16* dst,
|
||||
int dst_width);
|
||||
extern "C" void ScaleRowUp2_16_C(const uint16* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16* dst,
|
||||
int dst_width);
|
||||
|
||||
TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
|
||||
SIMD_ALIGNED(uint16 orig_pixels[640 * 2 + 1]); // 2 rows + 1 pixel overrun
|
||||
@ -469,10 +471,19 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
|
||||
for (int i = 0; i < 640 * 2 + 1; ++i) {
|
||||
orig_pixels[i] = i;
|
||||
}
|
||||
ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
|
||||
|
||||
ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
|
||||
MaskCpuFlags(benchmark_cpu_info_);
|
||||
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
||||
ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
int has_neon = TestCpuFlag(kCpuHasNEON);
|
||||
if (has_neon) {
|
||||
ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
|
||||
} else {
|
||||
ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
|
||||
}
|
||||
#else
|
||||
ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int i = 0; i < 1280; ++i) {
|
||||
@ -481,7 +492,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
|
||||
EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
|
||||
EXPECT_EQ(dst_pixels_c[1279], 800);
|
||||
}
|
||||
#endif
|
||||
|
||||
extern "C" void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
@ -501,6 +511,7 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
|
||||
orig_pixels[i] = i;
|
||||
}
|
||||
ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
|
||||
MaskCpuFlags(benchmark_cpu_info_);
|
||||
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
int has_neon = TestCpuFlag(kCpuHasNEON);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user