diff --git a/README.chromium b/README.chromium
index db0924e97..aed82de2b 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1632
+Version: 1633
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 9a47fd024..b743dc122 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -288,6 +288,7 @@ extern "C" {
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_COPYROW_NEON
+#define HAS_HALFFLOATROW_NEON
 #define HAS_I400TOARGBROW_NEON
 #define HAS_I422ALPHATOARGBROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
@@ -329,11 +330,6 @@ extern "C" {
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
 
-// TODO(fbarchard): Port to 32 bit.
-#if defined(__aarch64__)
-#define HAS_HALFFLOATROW_NEON
-#endif
-
 // Effects:
 #define HAS_ARGBADDROW_NEON
 #define HAS_ARGBATTENUATEROW_NEON
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 55f11a39b..00e11d709 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1632
+#define LIBYUV_VERSION 1633
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_neon.cc b/source/row_neon.cc
index dc081fa30..9385b275d 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2733,7 +2733,67 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
   : "cc", "memory", "q0", "q1"  // Clobber List
   );
 }
-#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+// Convert 16 bit integers to half floats with a scale of 1.0, 8 per loop.
+// Multiplying the float by 2^-112 (1.9259299444e-34f) rebiases the exponent so
+// that a saturating narrowing shift right by 13 leaves the half float bits.
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
+  asm volatile (
+    "vdup.32    q0, %3                         \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
+    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+    "vmovl.u16  q2, d2                         \n"  // widen 8 shorts to 8 ints
+    "vmovl.u16  q3, d3                         \n"
+    "vcvt.f32.u32 q2, q2                       \n"  // 8 floats
+    "vcvt.f32.u32 q3, q3                       \n"
+    "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
+    "vmul.f32   q3, q3, q0                     \n"
+    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
+    "vqshrn.u32 d3, q3, #13                    \n"
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 8 halffloats to dst
+    "bgt        1b                             \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "r"(1.9259299444e-34f)    // %3
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Same conversion with a caller supplied scale folded into the 2^-112 bias.
+// TODO(fbarchard): multiply by element.
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+    "vdup.32    q0, %3                         \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
+    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+    "vmovl.u16  q2, d2                         \n"  // widen 8 shorts to 8 ints
+    "vmovl.u16  q3, d3                         \n"
+    "vcvt.f32.u32 q2, q2                       \n"  // 8 floats
+    "vcvt.f32.u32 q3, q3                       \n"
+    "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
+    "vmul.f32   q3, q3, q0                     \n"
+    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
+    "vqshrn.u32 d3, q3, #13                    \n"
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 8 halffloats to dst
+    "bgt        1b                             \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "r"(scale * 1.9259299444e-34f)    // %3
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 2f0227cf0..3ec6bab8c 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2718,19 +2718,19 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
     "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
     "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
     "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
-    "uxtl2      v1.4s, v1.8h                   \n"
+    "uxtl2      v3.4s, v1.8h                   \n"
     "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
-    "scvtf      v1.4s, v1.4s                   \n"
-    "fcvtn      v4.4h, v2.4s                   \n"  // 8 floatsgit
-    "fcvtn2     v4.8h, v1.4s                   \n"
+    "scvtf      v3.4s, v3.4s                   \n"
+    "fcvtn      v1.4h, v2.4s                   \n"  // 8 floatsgit
+    "fcvtn2     v1.8h, v3.4s                   \n"
     MEMACCESS(1)
-    "st1        {v4.16b}, [%1], #16            \n"  // store 8 shorts
+    "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
     "b.gt       1b                             \n"
   : "+r"(src),    // %0
     "+r"(dst),    // %1
     "+r"(width)   // %2
   :
-  : "cc", "memory", "v1", "v2", "v4"
+  : "cc", "memory", "v1", "v2", "v3"
   );
 }
 
@@ -2741,21 +2741,21 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
     "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
     "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
     "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
-    "uxtl2      v1.4s, v1.8h                   \n"
+    "uxtl2      v3.4s, v1.8h                   \n"
     "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
-    "scvtf      v1.4s, v1.4s                   \n"
+    "scvtf      v3.4s, v3.4s                   \n"
     "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
-    "fmul       v1.4s, v1.4s, %3.s[0]          \n"
-    "uqshrn     v4.4h, v2.4s, #13              \n"  // isolate halffloat
-    "uqshrn2    v4.8h, v1.4s, #13              \n"
+    "fmul       v3.4s, v3.4s, %3.s[0]          \n"
+    "uqshrn     v1.4h, v2.4s, #13              \n"  // isolate halffloat
+    "uqshrn2    v1.8h, v3.4s, #13              \n"
     MEMACCESS(1)
-    "st1        {v4.16b}, [%1], #16            \n"  // store 8 shorts
+    "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
     "b.gt       1b                             \n"
   : "+r"(src),    // %0
     "+r"(dst),    // %1
     "+r"(width)   // %2
   : "w"(scale * 1.9259299444e-34f)  // %3
-  : "cc", "memory", "v1", "v2", "v4"
+  : "cc", "memory", "v1", "v2", "v3"
   );
 }
 