diff --git a/README.chromium b/README.chromium index 33ba36edb..bffa74dec 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1706 +Version: 1707 License: BSD License File: LICENSE diff --git a/docs/getting_started.md b/docs/getting_started.md index 09297b66a..2b986ab19 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -103,6 +103,10 @@ ios simulator ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest +ios disassembly + + otool -tV ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt + ### Android https://code.google.com/p/chromium/wiki/AndroidBuildInstructions @@ -144,6 +148,8 @@ arm disassembly: third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt + Caveat: Disassembly may require optimize_max be disabled in BUILD.gn + Running tests: build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 5fad06b7c..ecc9f6a7a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -55,9 +55,9 @@ extern "C" { #endif // clang >= 3.4 #endif // __clang__ -// clang >= 6.0.0 required for AVX512. +// clang >= 7.0.0 required for AVX512. 
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -#if (__clang_major__ >= 6) +#if (__clang_major__ >= 7) #define CLANG_HAS_AVX512 1 #endif // clang >= 6 #endif // __clang__ diff --git a/include/libyuv/version.h b/include/libyuv/version.h index cf8fc3f6d..673976e47 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1706 +#define LIBYUV_VERSION 1707 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_neon.cc b/source/row_neon.cc index 8b6c19520..ff87e74c6 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2604,12 +2604,15 @@ void SobelYRow_NEON(const uint8_t* src_y0, ); } +// %y passes a float as a scalar vector for vector * scalar multiply. +// the register must be d0 to d15 and indexed with [0] or [1] to access +// the float in the first or second float of the d-reg + void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float /*unused*/, int width) { asm volatile( - "vdup.32 q0, %3 \n" "1: \n" "vld1.8 {q1}, [%0]! \n" // load 8 shorts @@ -2618,8 +2621,8 @@ void HalfFloat1Row_NEON(const uint16_t* src, "vmovl.u16 q3, d3 \n" "vcvt.f32.u32 q2, q2 \n" // 8 floats "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, q0 \n" // adjust exponent - "vmul.f32 q3, q3, q0 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat "vqshrn.u32 d3, q3, #13 \n" "vst1.8 {q1}, [%1]! \n" @@ -2627,17 +2630,15 @@ void HalfFloat1Row_NEON(const uint16_t* src, : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : "r"(1.9259299444e-34f) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3"); + : "w"(1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); } -// TODO(fbarchard): multiply by element. void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { asm volatile( - "vdup.32 q0, %3 \n" "1: \n" "vld1.8 {q1}, [%0]! 
\n" // load 8 shorts @@ -2646,8 +2647,8 @@ void HalfFloatRow_NEON(const uint16_t* src, "vmovl.u16 q3, d3 \n" "vcvt.f32.u32 q2, q2 \n" // 8 floats "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, q0 \n" // adjust exponent - "vmul.f32 q3, q3, q0 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat "vqshrn.u32 d3, q3, #13 \n" "vst1.8 {q1}, [%1]! \n" @@ -2655,8 +2656,8 @@ void HalfFloatRow_NEON(const uint16_t* src, : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : "r"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3"); + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); } void ByteToFloatRow_NEON(const uint8_t* src, @@ -2664,7 +2665,6 @@ void ByteToFloatRow_NEON(const uint8_t* src, float scale, int width) { asm volatile( - "vdup.32 q0, %3 \n" "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 bytes @@ -2674,15 +2674,15 @@ void ByteToFloatRow_NEON(const uint8_t* src, "vmovl.u16 q3, d3 \n" "vcvt.f32.u32 q2, q2 \n" // 8 floats "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, d0[0] \n" // scale - "vmul.f32 q3, q3, d0[0] \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : "r"(scale) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3"); + : "w"(scale) // %3 + : "cc", "memory", "q1", "q2", "q3"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..