mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Pass float parameters via vector 2 float and "w" for scalar multiply.
Scalar multiply expects a 'd' register. The "w" (float) constraint uses an 's' register for float and won't work with the multiply in 32 bit (it does in 64 bit). A vector 2 of float passes as a 'd' register. A vector 4 of float passes as a 'q' register. This change copies the float into the first entry of a vector 2 and passes that. The optimizer removes the extra copy, allowing the single float to be referenced as an indexed 'd' register element (e.g. d0[0]). Test: LibYUVPlanarTest.TestByteToFloat Bug: libyuv:786 Change-Id: I8773c5bae043c7b84e1d1db7fdea6731aa0b1323 Reviewed-on: https://chromium-review.googlesource.com/973984 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Miguel Casas <mcasas@chromium.org>
This commit is contained in:
parent
d8680893ec
commit
4ad33344cf
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1706
|
Version: 1707
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -103,6 +103,10 @@ ios simulator
|
|||||||
ninja -v -C out/Debug libyuv_unittest
|
ninja -v -C out/Debug libyuv_unittest
|
||||||
ninja -v -C out/Release libyuv_unittest
|
ninja -v -C out/Release libyuv_unittest
|
||||||
|
|
||||||
|
ios disassembly
|
||||||
|
|
||||||
|
otool -tV ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
|
||||||
|
|
||||||
### Android
|
### Android
|
||||||
https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
|
https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
|
||||||
|
|
||||||
@ -144,6 +148,8 @@ arm disassembly:
|
|||||||
|
|
||||||
third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
|
third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
|
||||||
|
|
||||||
|
Caveat: Disassembly may require optimize_max to be disabled in BUILD.gn
|
||||||
|
|
||||||
Running tests:
|
Running tests:
|
||||||
|
|
||||||
build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
|
build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
|
||||||
|
|||||||
@ -55,9 +55,9 @@ extern "C" {
|
|||||||
#endif // clang >= 3.4
|
#endif // clang >= 3.4
|
||||||
#endif // __clang__
|
#endif // __clang__
|
||||||
|
|
||||||
// clang >= 6.0.0 required for AVX512.
|
// clang >= 7.0.0 required for AVX512.
|
||||||
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
|
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
|
||||||
#if (__clang_major__ >= 6)
|
#if (__clang_major__ >= 7)
|
||||||
#define CLANG_HAS_AVX512 1
|
#define CLANG_HAS_AVX512 1
|
||||||
#endif // clang >= 6
|
#endif // clang >= 7
|
||||||
#endif // __clang__
|
#endif // __clang__
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1706
|
#define LIBYUV_VERSION 1707
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -2604,12 +2604,15 @@ void SobelYRow_NEON(const uint8_t* src_y0,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// %y passes a float as a scalar vector for vector * scalar multiply.
|
||||||
|
// The register must be d0 to d15 and indexed with [0] or [1] to access
|
||||||
|
// the float held in the first or second 32-bit element of the d-reg.
|
||||||
|
|
||||||
void HalfFloat1Row_NEON(const uint16_t* src,
|
void HalfFloat1Row_NEON(const uint16_t* src,
|
||||||
uint16_t* dst,
|
uint16_t* dst,
|
||||||
float /*unused*/,
|
float /*unused*/,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vdup.32 q0, %3 \n"
|
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||||
@ -2618,8 +2621,8 @@ void HalfFloat1Row_NEON(const uint16_t* src,
|
|||||||
"vmovl.u16 q3, d3 \n"
|
"vmovl.u16 q3, d3 \n"
|
||||||
"vcvt.f32.u32 q2, q2 \n" // 8 floats
|
"vcvt.f32.u32 q2, q2 \n" // 8 floats
|
||||||
"vcvt.f32.u32 q3, q3 \n"
|
"vcvt.f32.u32 q3, q3 \n"
|
||||||
"vmul.f32 q2, q2, q0 \n" // adjust exponent
|
"vmul.f32 q2, q2, %y3 \n" // adjust exponent
|
||||||
"vmul.f32 q3, q3, q0 \n"
|
"vmul.f32 q3, q3, %y3 \n"
|
||||||
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
|
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
|
||||||
"vqshrn.u32 d3, q3, #13 \n"
|
"vqshrn.u32 d3, q3, #13 \n"
|
||||||
"vst1.8 {q1}, [%1]! \n"
|
"vst1.8 {q1}, [%1]! \n"
|
||||||
@ -2627,17 +2630,15 @@ void HalfFloat1Row_NEON(const uint16_t* src,
|
|||||||
: "+r"(src), // %0
|
: "+r"(src), // %0
|
||||||
"+r"(dst), // %1
|
"+r"(dst), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
: "r"(1.9259299444e-34f) // %3
|
: "w"(1.9259299444e-34f) // %3
|
||||||
: "cc", "memory", "q0", "q1", "q2", "q3");
|
: "cc", "memory", "q1", "q2", "q3");
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(fbarchard): multiply by element.
|
|
||||||
void HalfFloatRow_NEON(const uint16_t* src,
|
void HalfFloatRow_NEON(const uint16_t* src,
|
||||||
uint16_t* dst,
|
uint16_t* dst,
|
||||||
float scale,
|
float scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vdup.32 q0, %3 \n"
|
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||||
@ -2646,8 +2647,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
|
|||||||
"vmovl.u16 q3, d3 \n"
|
"vmovl.u16 q3, d3 \n"
|
||||||
"vcvt.f32.u32 q2, q2 \n" // 8 floats
|
"vcvt.f32.u32 q2, q2 \n" // 8 floats
|
||||||
"vcvt.f32.u32 q3, q3 \n"
|
"vcvt.f32.u32 q3, q3 \n"
|
||||||
"vmul.f32 q2, q2, q0 \n" // adjust exponent
|
"vmul.f32 q2, q2, %y3 \n" // adjust exponent
|
||||||
"vmul.f32 q3, q3, q0 \n"
|
"vmul.f32 q3, q3, %y3 \n"
|
||||||
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
|
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
|
||||||
"vqshrn.u32 d3, q3, #13 \n"
|
"vqshrn.u32 d3, q3, #13 \n"
|
||||||
"vst1.8 {q1}, [%1]! \n"
|
"vst1.8 {q1}, [%1]! \n"
|
||||||
@ -2655,8 +2656,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
|
|||||||
: "+r"(src), // %0
|
: "+r"(src), // %0
|
||||||
"+r"(dst), // %1
|
"+r"(dst), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
: "r"(scale * 1.9259299444e-34f) // %3
|
: "w"(scale * 1.9259299444e-34f) // %3
|
||||||
: "cc", "memory", "q0", "q1", "q2", "q3");
|
: "cc", "memory", "q1", "q2", "q3");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ByteToFloatRow_NEON(const uint8_t* src,
|
void ByteToFloatRow_NEON(const uint8_t* src,
|
||||||
@ -2664,7 +2665,6 @@ void ByteToFloatRow_NEON(const uint8_t* src,
|
|||||||
float scale,
|
float scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vdup.32 q0, %3 \n"
|
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
|
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
|
||||||
@ -2674,15 +2674,15 @@ void ByteToFloatRow_NEON(const uint8_t* src,
|
|||||||
"vmovl.u16 q3, d3 \n"
|
"vmovl.u16 q3, d3 \n"
|
||||||
"vcvt.f32.u32 q2, q2 \n" // 8 floats
|
"vcvt.f32.u32 q2, q2 \n" // 8 floats
|
||||||
"vcvt.f32.u32 q3, q3 \n"
|
"vcvt.f32.u32 q3, q3 \n"
|
||||||
"vmul.f32 q2, q2, d0[0] \n" // scale
|
"vmul.f32 q2, q2, %y3 \n" // scale
|
||||||
"vmul.f32 q3, q3, d0[0] \n"
|
"vmul.f32 q3, q3, %y3 \n"
|
||||||
"vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
|
"vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src), // %0
|
: "+r"(src), // %0
|
||||||
"+r"(dst), // %1
|
"+r"(dst), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
: "r"(scale) // %3
|
: "w"(scale) // %3
|
||||||
: "cc", "memory", "q0", "q1", "q2", "q3");
|
: "cc", "memory", "q1", "q2", "q3");
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
|
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user