mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 09:47:45 +08:00
HalfFloat Neon for ARMv7.
The 64-bit version is made similar to the 32-bit one: register 1 holds the loaded input and the stored result, and registers 2 and 3 hold the expanded float temporaries. TEST=out/Release/libyuv_unittest --gtest_filter=*Half* BUG=libyuv:560 R=wangcheng@google.com Review URL: https://codereview.chromium.org/2467723002 .
This commit is contained in:
parent
10ce829bad
commit
eca08525cb
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1632
|
||||
Version: 1633
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -288,6 +288,7 @@ extern "C" {
|
||||
#define HAS_BGRATOUVROW_NEON
|
||||
#define HAS_BGRATOYROW_NEON
|
||||
#define HAS_COPYROW_NEON
|
||||
#define HAS_HALFFLOATROW_NEON
|
||||
#define HAS_I400TOARGBROW_NEON
|
||||
#define HAS_I422ALPHATOARGBROW_NEON
|
||||
#define HAS_I422TOARGB1555ROW_NEON
|
||||
@ -329,11 +330,6 @@ extern "C" {
|
||||
#define HAS_YUY2TOUVROW_NEON
|
||||
#define HAS_YUY2TOYROW_NEON
|
||||
|
||||
// TODO(fbarchard): Port to 32 bit.
|
||||
#if defined(__aarch64__)
|
||||
#define HAS_HALFFLOATROW_NEON
|
||||
#endif
|
||||
|
||||
// Effects:
|
||||
#define HAS_ARGBADDROW_NEON
|
||||
#define HAS_ARGBATTENUATEROW_NEON
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1632
|
||||
#define LIBYUV_VERSION 1633
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -2733,7 +2733,63 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
|
||||
: "cc", "memory", "q0", "q1" // Clobber List
|
||||
);
|
||||
}
|
||||
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
|
||||
|
||||
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
|
||||
asm volatile (
|
||||
"vdup.32 q0, %3 \n"
|
||||
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||
"subs %2, %2, #8 \n" // 8 pixels per loop
|
||||
"vmovl.u8 q2, d2 \n" // 8 int's
|
||||
"vmovl.u8 q3, d3 \n"
|
||||
"vcvt.f32.u32 q2, q2 \n" // 8 floats
|
||||
"vcvt.f32.u32 q3, q3 \n"
|
||||
"vmul.f32 q2, q2, q0 \n" // adjust exponent
|
||||
"vmul.f32 q3, q3, q0 \n"
|
||||
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
|
||||
"vqshrn.u32 d3, q3, #13 \n"
|
||||
MEMACCESS(1)
|
||||
"vst1.8 {q1}, [%0]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(1.9259299444e-34f) // %3
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3"
|
||||
);
|
||||
}
|
||||
|
||||
// TODO(fbarchard): multiply by element.
|
||||
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
|
||||
asm volatile (
|
||||
"vdup.32 q0, %3 \n"
|
||||
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||
"subs %2, %2, #8 \n" // 8 pixels per loop
|
||||
"vmovl.u8 q2, d2 \n" // 8 int's
|
||||
"vmovl.u8 q3, d3 \n"
|
||||
"vcvt.f32.u32 q2, q2 \n" // 8 floats
|
||||
"vcvt.f32.u32 q3, q3 \n"
|
||||
"vmul.f32 q2, q2, q0 \n" // adjust exponent
|
||||
"vmul.f32 q3, q3, q0 \n"
|
||||
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
|
||||
"vqshrn.u32 d3, q3, #13 \n"
|
||||
MEMACCESS(1)
|
||||
"vst1.8 {q1}, [%0]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale * 1.9259299444e-34f) // %3
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3"
|
||||
);
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
|
||||
@ -2718,19 +2718,19 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
|
||||
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
|
||||
"subs %w2, %w2, #8 \n" // 8 pixels per loop
|
||||
"uxtl v2.4s, v1.4h \n" // 8 int's
|
||||
"uxtl2 v1.4s, v1.8h \n"
|
||||
"uxtl2 v3.4s, v1.8h \n"
|
||||
"scvtf v2.4s, v2.4s \n" // 8 floats
|
||||
"scvtf v1.4s, v1.4s \n"
|
||||
"fcvtn v4.4h, v2.4s \n" // 8 floatsgit
|
||||
"fcvtn2 v4.8h, v1.4s \n"
|
||||
"scvtf v3.4s, v3.4s \n"
|
||||
"fcvtn v1.4h, v2.4s \n" // 8 floatsgit
|
||||
"fcvtn2 v1.8h, v3.4s \n"
|
||||
MEMACCESS(1)
|
||||
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
:
|
||||
: "cc", "memory", "v1", "v2", "v4"
|
||||
: "cc", "memory", "v1", "v2", "v3"
|
||||
);
|
||||
}
|
||||
|
||||
@ -2741,21 +2741,21 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
|
||||
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
|
||||
"subs %w2, %w2, #8 \n" // 8 pixels per loop
|
||||
"uxtl v2.4s, v1.4h \n" // 8 int's
|
||||
"uxtl2 v1.4s, v1.8h \n"
|
||||
"uxtl2 v3.4s, v1.8h \n"
|
||||
"scvtf v2.4s, v2.4s \n" // 8 floats
|
||||
"scvtf v1.4s, v1.4s \n"
|
||||
"scvtf v3.4s, v3.4s \n"
|
||||
"fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
|
||||
"fmul v1.4s, v1.4s, %3.s[0] \n"
|
||||
"uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat
|
||||
"uqshrn2 v4.8h, v1.4s, #13 \n"
|
||||
"fmul v3.4s, v3.4s, %3.s[0] \n"
|
||||
"uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
|
||||
"uqshrn2 v1.8h, v3.4s, #13 \n"
|
||||
MEMACCESS(1)
|
||||
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "w"(scale * 1.9259299444e-34f) // %3
|
||||
: "cc", "memory", "v1", "v2", "v4"
|
||||
: "cc", "memory", "v1", "v2", "v3"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user