mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
GaussCol_NEON resample from short to int
Benchmark results: Old NEON LibYUVPlanarTest.TestGaussCol_Opt (916 ms); New NEON LibYUVPlanarTest.TestGaussCol_Opt (520 ms); C vectorized LibYUVPlanarTest.TestGaussCol_Opt (739 ms). TBR=kjellander@chromium.org BUG=libyuv:719 TEST=LibYUVPlanarTest.TestGaussCol_Opt Change-Id: I863b66f700f7a71fcb08a2eabb03240fdaf8a238 Reviewed-on: https://chromium-review.googlesource.com/626938 Reviewed-by: Cheng Wang <wangcheng@google.com>
This commit is contained in:
parent
c5bad809b1
commit
1cc539f7d6
@ -2672,6 +2672,20 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||||
|
void GaussCol_C(const uint16* src0,
|
||||||
|
const uint16* src1,
|
||||||
|
const uint16* src2,
|
||||||
|
const uint16* src3,
|
||||||
|
const uint16* src4,
|
||||||
|
uint32* dst,
|
||||||
|
int width) {
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < width; ++i) {
|
||||||
|
*dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
|
|||||||
@ -2699,6 +2699,50 @@ static vec16 kGauseCoefficients[4] = {
|
|||||||
{0, 0, 0, 1, 4, 6, 4, 1},
|
{0, 0, 0, 1, 4, 6, 4, 1},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||||
|
void GaussCol_NEON(const uint16* src0,
|
||||||
|
const uint16* src1,
|
||||||
|
const uint16* src2,
|
||||||
|
const uint16* src3,
|
||||||
|
const uint16* src4,
|
||||||
|
uint32* dst,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"movi v6.8h, #4 \n" // constant 4
|
||||||
|
"movi v7.8h, #6 \n" // constant 6
|
||||||
|
|
||||||
|
"1: \n"
|
||||||
|
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
|
||||||
|
"ld1 {v2.8h}, [%1], #16 \n"
|
||||||
|
"ld1 {v3.8h}, [%2], #16 \n"
|
||||||
|
"ld1 {v4.8h}, [%3], #16 \n"
|
||||||
|
"ld1 {v5.8h}, [%4], #16 \n"
|
||||||
|
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||||
|
|
||||||
|
"uaddl v0.4s, v1.4h, v5.4h \n" // * 1
|
||||||
|
"uaddl2 v1.4s, v1.8h, v5.8h \n" // * 1
|
||||||
|
|
||||||
|
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||||
|
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||||
|
"umlal v0.4s, v3.4h, v7.4h \n" // * 6
|
||||||
|
"umlal2 v1.4s, v3.8h, v7.8h \n" // * 6
|
||||||
|
"umlal v0.4s, v4.4h, v6.4h \n" // * 4
|
||||||
|
"umlal2 v1.4s, v4.8h, v6.8h \n" // * 4
|
||||||
|
|
||||||
|
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
|
||||||
|
"b.gt 1b \n"
|
||||||
|
|
||||||
|
: "+r"(src0), // %0
|
||||||
|
"+r"(src1), // %1
|
||||||
|
"+r"(src2), // %2
|
||||||
|
"+r"(src3), // %3
|
||||||
|
"+r"(src4), // %4
|
||||||
|
"+r"(dst), // %5
|
||||||
|
"+r"(width) // %6
|
||||||
|
:
|
||||||
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||||
|
}
|
||||||
|
|
||||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||||
void GaussRow_NEON(const uint16* src0, uint16* dst, int width) {
|
void GaussRow_NEON(const uint16* src0, uint16* dst, int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
@ -2736,46 +2780,6 @@ void GaussRow_NEON(const uint16* src0, uint16* dst, int width) {
|
|||||||
"v23");
|
"v23");
|
||||||
}
|
}
|
||||||
|
|
||||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
|
||||||
void GaussCol_NEON(const uint32* src0,
|
|
||||||
const uint32* src1,
|
|
||||||
const uint32* src2,
|
|
||||||
const uint32* src3,
|
|
||||||
const uint32* src4,
|
|
||||||
uint16* dst,
|
|
||||||
int width) {
|
|
||||||
asm volatile(
|
|
||||||
"movi v5.4s, #4 \n" // constant 4
|
|
||||||
"movi v6.4s, #6 \n" // constant 6
|
|
||||||
|
|
||||||
"1: \n"
|
|
||||||
"ld1 {v0.4s}, [%0], #16 \n" // load 4 samples, 5 rows
|
|
||||||
"ld1 {v1.4s}, [%1], #16 \n"
|
|
||||||
"ld1 {v2.4s}, [%2], #16 \n"
|
|
||||||
"ld1 {v3.4s}, [%3], #16 \n"
|
|
||||||
"ld1 {v4.4s}, [%4], #16 \n"
|
|
||||||
"subs %w6, %w6, #4 \n" // 4 processed per loop
|
|
||||||
|
|
||||||
"add v0.4s, v0.4s, v4.4s \n" // * 1
|
|
||||||
"mla v0.4s, v1.4s, v5.4s \n" // * 4
|
|
||||||
"mla v0.4s, v2.4s, v6.4s \n" // * 6
|
|
||||||
"mla v0.4s, v3.4s, v5.4s \n" // * 4
|
|
||||||
|
|
||||||
"uqshrn v0.4h, v0.4s, #8 \n" // round, shift by 8 pack.
|
|
||||||
"st1 {v0.4h}, [%5], #8 \n" // store 4 samples
|
|
||||||
"b.gt 1b \n"
|
|
||||||
|
|
||||||
: "+r"(src0), // %0
|
|
||||||
"+r"(src1), // %1
|
|
||||||
"+r"(src2), // %2
|
|
||||||
"+r"(src3), // %3
|
|
||||||
"+r"(src4), // %4
|
|
||||||
"+r"(dst), // %5
|
|
||||||
"+r"(width) // %6
|
|
||||||
:
|
|
||||||
: "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
@ -2742,6 +2742,7 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
|
|||||||
orig_pixels[i] = i;
|
orig_pixels[i] = i;
|
||||||
}
|
}
|
||||||
GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280);
|
GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280);
|
||||||
|
MaskCpuFlags(benchmark_cpu_info_);
|
||||||
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
||||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||||
int has_neon = TestCpuFlag(kCpuHasNEON);
|
int has_neon = TestCpuFlag(kCpuHasNEON);
|
||||||
@ -2760,22 +2761,29 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EXPECT_EQ(dst_pixels_c[0], 0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1);
|
EXPECT_EQ(dst_pixels_c[0], 0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1);
|
||||||
EXPECT_EQ(dst_pixels_c[1279],
|
EXPECT_EQ(dst_pixels_c[1279], 20496);
|
||||||
1279 * 1 + 1280 * 4 + 1281 * 6 + 1282 * 4 + 1283 * 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" void GaussCol_NEON(const uint32* src0,
|
extern "C" void GaussCol_NEON(const uint16* src0,
|
||||||
const uint32* src1,
|
const uint16* src1,
|
||||||
const uint32* src2,
|
const uint16* src2,
|
||||||
const uint32* src3,
|
const uint16* src3,
|
||||||
const uint32* src4,
|
const uint16* src4,
|
||||||
uint16* dst,
|
uint32* dst,
|
||||||
int width);
|
int width);
|
||||||
|
|
||||||
|
extern "C" void GaussCol_C(const uint16* src0,
|
||||||
|
const uint16* src1,
|
||||||
|
const uint16* src2,
|
||||||
|
const uint16* src3,
|
||||||
|
const uint16* src4,
|
||||||
|
uint32* dst,
|
||||||
|
int width);
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
||||||
SIMD_ALIGNED(uint32 orig_pixels[1280 * 5]);
|
SIMD_ALIGNED(uint16 orig_pixels[1280 * 5]);
|
||||||
SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
|
SIMD_ALIGNED(uint32 dst_pixels_c[1280]);
|
||||||
SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
|
SIMD_ALIGNED(uint32 dst_pixels_opt[1280]);
|
||||||
|
|
||||||
memset(orig_pixels, 0, sizeof(orig_pixels));
|
memset(orig_pixels, 0, sizeof(orig_pixels));
|
||||||
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
|
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
|
||||||
@ -2784,9 +2792,10 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
|||||||
for (int i = 0; i < 1280 * 5; ++i) {
|
for (int i = 0; i < 1280 * 5; ++i) {
|
||||||
orig_pixels[i] = i;
|
orig_pixels[i] = i;
|
||||||
}
|
}
|
||||||
GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
|
||||||
&dst_pixels_c[0], 1280);
|
1280);
|
||||||
|
MaskCpuFlags(benchmark_cpu_info_);
|
||||||
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
||||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||||
int has_neon = TestCpuFlag(kCpuHasNEON);
|
int has_neon = TestCpuFlag(kCpuHasNEON);
|
||||||
@ -2795,14 +2804,14 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
|||||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||||
&dst_pixels_opt[0], 1280);
|
&dst_pixels_opt[0], 1280);
|
||||||
} else {
|
} else {
|
||||||
GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||||
&dst_pixels_opt[0], 1280);
|
&dst_pixels_opt[0], 1280);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||||
&dst_pixels_opt[0], 1280);
|
&dst_pixels_opt[0], 1280);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2810,10 +2819,9 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
|||||||
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
EXPECT_EQ(dst_pixels_c[0], (0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 +
|
EXPECT_EQ(dst_pixels_c[0],
|
||||||
1280 * 4 * 1 + 128) /
|
0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 + 1280 * 4 * 1);
|
||||||
256);
|
EXPECT_EQ(dst_pixels_c[1279], 61424);
|
||||||
EXPECT_EQ(dst_pixels_c[1279], 239);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // aarch64
|
#endif // aarch64
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user