mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
Gauss unittest, Scale comments for neon64 half size updated
[ RUN      ] LibYUVPlanarTest.TestGaussRow_Opt
[       OK ] LibYUVPlanarTest.TestGaussRow_Opt (1274 ms)
[ RUN      ] LibYUVPlanarTest.TestGaussCol_Opt
[       OK ] LibYUVPlanarTest.TestGaussCol_Opt (916 ms)

TBR=kjellander@chromium.org
BUG=libyuv:719
TEST=LibYUVPlanarTest.TestGaussRow_Opt

Change-Id: Id480f3870c40c2b40dfb9f072cb7118ebad41afc
Reviewed-on: https://chromium-review.googlesource.com/624701
Reviewed-by: Cheng Wang <wangcheng@google.com>
This commit is contained in:
parent
0c957d183e
commit
c5bad809b1
@ -2693,16 +2693,14 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
|
||||
}
|
||||
|
||||
// 1, 4, 6, 4, 1 filter taps, one row per output sample: row k holds the taps
// shifted right by k lanes, with the remaining lanes zero.
// The table is declared with 4 rows, so exactly 4 initializers are listed.
// It is only read (loaded with ld1 by GaussRow_NEON), hence const.
static const vec16 kGauseCoefficients[4] = {
    {1, 4, 6, 4, 1, 0, 0, 0},
    {0, 1, 4, 6, 4, 1, 0, 0},
    {0, 0, 1, 4, 6, 4, 1, 0},
    {0, 0, 0, 1, 4, 6, 4, 1},
};
|
||||
|
||||
// filter 5 adjacent samples of a row with 1, 4, 6, 4, 1 coefficients to
// produce 1 output sample.
|
||||
void GaussRow_NEON(const uint16* src0,
|
||||
uint16* dst,
|
||||
int width) {
|
||||
void GaussRow_NEON(const uint16* src0, uint16* dst, int width) {
|
||||
asm volatile(
|
||||
"ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [%3] \n"
|
||||
|
||||
@ -2729,13 +2727,13 @@ void GaussRow_NEON(const uint16* src0,
|
||||
"st4 {v1.s,v2.s,v3.s,v4.s}[0], [%1], #16 \n" // store 4 samples
|
||||
"b.gt 1b \n"
|
||||
|
||||
: "+r"(src0), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(&kGauseCoefficients[0]) // %3
|
||||
"r"(8LL) // %4
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
|
||||
"v20", "v21", "v22", "v23");
|
||||
: "+r"(src0), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(&kGauseCoefficients[0]), // %3
|
||||
"r"(8LL) // %4
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v20", "v21", "v22",
|
||||
"v23");
|
||||
}
|
||||
|
||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||
|
||||
@ -1009,11 +1009,9 @@ void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop
|
||||
"uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
|
||||
"uaddlp v1.4s, v1.8h \n"
|
||||
"uadalp v0.4s, v2.8h \n" // row 2 add adjacent +
|
||||
// row1
|
||||
"uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
|
||||
"uadalp v1.4s, v3.8h \n"
|
||||
"rshrn v0.4h, v0.4s, #2 \n" // downshift, round and
|
||||
// pack
|
||||
"rshrn v0.4h, v0.4s, #2 \n" // round and pack
|
||||
"rshrn2 v0.8h, v1.4s, #2 \n"
|
||||
"st1 {v0.8h}, [%2], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
@ -2725,4 +2725,97 @@ TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
extern "C" void GaussRow_NEON(const uint16* src0, uint32* dst, int width);
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
|
||||
SIMD_ALIGNED(uint16 orig_pixels[1280 + 4]);
|
||||
SIMD_ALIGNED(uint32 dst_pixels_c[1280]);
|
||||
SIMD_ALIGNED(uint32 dst_pixels_opt[1280]);
|
||||
|
||||
memset(orig_pixels, 0, sizeof(orig_pixels));
|
||||
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
|
||||
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
|
||||
|
||||
for (int i = 0; i < 1280 + 4; ++i) {
|
||||
orig_pixels[i] = i;
|
||||
}
|
||||
GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280);
|
||||
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
int has_neon = TestCpuFlag(kCpuHasNEON);
|
||||
if (has_neon) {
|
||||
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
|
||||
} else {
|
||||
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
|
||||
}
|
||||
#else
|
||||
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int i = 0; i < 1280; ++i) {
|
||||
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
||||
}
|
||||
|
||||
EXPECT_EQ(dst_pixels_c[0], 0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1);
|
||||
EXPECT_EQ(dst_pixels_c[1279],
|
||||
1279 * 1 + 1280 * 4 + 1281 * 6 + 1282 * 4 + 1283 * 1);
|
||||
}
|
||||
|
||||
extern "C" void GaussCol_NEON(const uint32* src0,
|
||||
const uint32* src1,
|
||||
const uint32* src2,
|
||||
const uint32* src3,
|
||||
const uint32* src4,
|
||||
uint16* dst,
|
||||
int width);
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
||||
SIMD_ALIGNED(uint32 orig_pixels[1280 * 5]);
|
||||
SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
|
||||
SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
|
||||
|
||||
memset(orig_pixels, 0, sizeof(orig_pixels));
|
||||
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
|
||||
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
|
||||
|
||||
for (int i = 0; i < 1280 * 5; ++i) {
|
||||
orig_pixels[i] = i;
|
||||
}
|
||||
GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||
&dst_pixels_c[0], 1280);
|
||||
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
int has_neon = TestCpuFlag(kCpuHasNEON);
|
||||
if (has_neon) {
|
||||
GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||
&dst_pixels_opt[0], 1280);
|
||||
} else {
|
||||
GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||
&dst_pixels_opt[0], 1280);
|
||||
}
|
||||
#else
|
||||
GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||
&dst_pixels_opt[0], 1280);
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int i = 0; i < 1280; ++i) {
|
||||
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
||||
}
|
||||
|
||||
EXPECT_EQ(dst_pixels_c[0], (0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 +
|
||||
1280 * 4 * 1 + 128) /
|
||||
256);
|
||||
EXPECT_EQ(dst_pixels_c[1279], 239);
|
||||
}
|
||||
|
||||
#endif // aarch64
|
||||
|
||||
} // namespace libyuv
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user