diff --git a/Android.mk b/Android.mk index 717ce85a3..8e4ba7258 100644 --- a/Android.mk +++ b/Android.mk @@ -22,8 +22,10 @@ LOCAL_SRC_FILES := \ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) LOCAL_CFLAGS += -DLIBYUV_NEON LOCAL_SRC_FILES += \ + source/compare_neon.cc \ source/rotate_neon.cc.neon \ - source/row_neon.cc.neon + source/row_neon.cc.neon \ + source/scale_neon.cc endif LOCAL_C_INCLUDES += $(LOCAL_PATH)/include diff --git a/README.chromium b/README.chromium index 81048636b..46e731e6d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 395 +Version: 396 License: BSD License File: LICENSE diff --git a/include/libyuv/compare.h b/include/libyuv/compare.h index 6a57bb585..5fd924b8c 100644 --- a/include/libyuv/compare.h +++ b/include/libyuv/compare.h @@ -18,7 +18,7 @@ namespace libyuv { extern "C" { #endif -// Compute a hash for specified memory. Seed of 5381 recommended. +// Compute a hash for specified memory. Seed of 5381 recommended. LIBYUV_API uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index bb4074dc5..1d4b6a5bb 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -73,7 +73,7 @@ int I400ToI420(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height); -// Convert NV12 to I420. Also used for NV21. +// Convert NV12 to I420. Also used for NV21. LIBYUV_API int NV12ToI420(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, @@ -229,7 +229,7 @@ int MJPGToI420(const uint8* sample, size_t sample_size, // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. -// "format" is a fourcc. ie 'I420', 'YUY2' +// "format" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. LIBYUV_API int ConvertToI420(const uint8* src_frame, size_t src_size, diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index 1f6495fb1..86085252f 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -19,7 +19,7 @@ // TODO(fbarchard): This set of functions should exactly match convert.h // Add missing V210 and Q420. -// TODO(fbarchard): Add tests. Create random content of right size and convert +// TODO(fbarchard): Add tests. Create random content of right size and convert // with C vs Opt and or to I420 and compare. // TODO(fbarchard): Some of these functions lack parameter setting. @@ -75,7 +75,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Convert I400 to ARGB. Reverse of ARGBToI400. +// Convert I400 to ARGB. Reverse of ARGBToI400. LIBYUV_API int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, @@ -209,7 +209,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. -// "format" is a fourcc. ie 'I420', 'YUY2' +// "format" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. 
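A minimal sketch (not part of the diff) of how the fourcc "format" value mentioned in the comment above is built: four ASCII characters packed little-endian into a uint32. libyuv's video_common.h provides a FOURCC macro for this; the manual packing below is only an illustration.

// 'Y','U','Y','2' packed little-endian into one 32-bit fourcc code.
uint32 fourcc_yuy2 = static_cast<uint32>('Y') |
                     (static_cast<uint32>('U') << 8) |
                     (static_cast<uint32>('Y') << 16) |
                     (static_cast<uint32>('2') << 24);
// Equivalent, using the helper from "libyuv/video_common.h":
// uint32 fourcc_yuy2 = FOURCC('Y', 'U', 'Y', '2');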
LIBYUV_API int ConvertToARGB(const uint8* src_frame, size_t src_size, diff --git a/include/libyuv/convert_from.h b/include/libyuv/convert_from.h index a659129e9..4eae950cc 100644 --- a/include/libyuv/convert_from.h +++ b/include/libyuv/convert_from.h @@ -50,7 +50,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height); -// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. LIBYUV_API int I400Copy(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index e8ac4a813..0914f1d23 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -58,7 +58,7 @@ static __inline int TestCpuFlag(int test_flag) { LIBYUV_API void MaskCpuFlags(int enable_flags); -// Low level cpuid for X86. Returns zeros on other CPUs. +// Low level cpuid for X86. Returns zeros on other CPUs. LIBYUV_API void CpuId(int cpu_info[4], int info_type); diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 4b3f25a73..7e43dabb2 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -27,13 +27,31 @@ void SetPlane(uint8* dst_y, int dst_stride_y, int width, int height, uint32 value); +// Alias. +#define I400ToI400 CopyPlane + // Copy a plane of data (I420 to I400). LIBYUV_API void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height); -// Convert I420 to I400. (calls CopyPlane ignoring u/v). +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I422. +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I420 to I400. (calls CopyPlane ignoring u/v). LIBYUV_API int I420ToI400(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, @@ -196,7 +214,7 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, const uint8* table_argb, int x, int y, int width, int height); -// Quantize a rectangle of ARGB. Alpha unaffected. +// Quantize a rectangle of ARGB. Alpha unaffected. // scale is a 16 bit fractional fixed point scaler between 0 and 65535. // interval_size should be a value between 1 and 255. // interval_offset should be a value between 0 and 255. @@ -261,7 +279,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, int w, int h, int dw, int dh); // Computes table of cumulative sum for image where the value is the sum -// of all values above and to the left of the entry. Used by ARGBBlur. +// of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, int32* dst_cumsum, int dst_stride32_cumsum, @@ -299,7 +317,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, #define YUV_DISABLE_ASM #endif // Row functions for copying a pixels from a source with a slope to a row -// of destination. Useful for scaling, rotation, mirror, texture mapping. +// of destination. Useful for scaling, rotation, mirror, texture mapping. 
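A C sketch (not part of the diff) of what an affine row function computes, assuming uv_dudv packs {u, v, du, dv}: the starting source coordinate followed by the per-destination-pixel step. Requires <cstring> for memcpy and "libyuv/basic_types.h" for uint8.

// For each destination pixel, sample the source at (u, v) and step by (du, dv).
void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                          uint8* dst_argb, const float* uv_dudv, int width) {
  float u = uv_dudv[0];  // assumed: starting source x
  float v = uv_dudv[1];  // assumed: starting source y
  for (int i = 0; i < width; ++i) {
    int x = static_cast<int>(u);
    int y = static_cast<int>(v);
    memcpy(dst_argb + i * 4, src_argb + y * src_argb_stride + x * 4, 4);  // one ARGB pixel
    u += uv_dudv[2];  // du
    v += uv_dudv[3];  // dv
  }
}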
LIBYUV_API void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* uv_dudv, int width); diff --git a/include/libyuv/row.h b/include/libyuv/row.h index a2317ddd6..4814f2544 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -66,6 +66,7 @@ extern "C" { #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 +#define HAS_SETROW_X86 #define HAS_SPLITUV_SSE2 #define HAS_UYVYTOUV422ROW_SSE2 #define HAS_UYVYTOUVROW_SSE2 @@ -76,13 +77,13 @@ extern "C" { #define HAS_YUY2TOYROW_SSE2 // Effects -#define HAS_ARGBMIRRORROW_SSSE3 #define HAS_ARGBAFFINEROW_SSE2 #define HAS_ARGBATTENUATEROW_SSSE3 #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBGRAYROW_SSSE3 #define HAS_ARGBINTERPOLATEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSSE3 #define HAS_ARGBQUANTIZEROW_SSE2 #define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSHADE_SSE2 @@ -93,9 +94,9 @@ extern "C" { // The following are Windows only: #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) +#define HAS_ABGRTOARGBROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 #define HAS_I422TORGBAROW_SSSE3 -#define HAS_ABGRTOARGBROW_SSSE3 #define HAS_RGBATOARGBROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 @@ -105,36 +106,42 @@ extern "C" { #if !defined(YUV_DISABLE_ASM) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_SSSE3_ONLY) -#define HAS_MIRRORROW_SSE2 #define HAS_ARGBATTENUATE_SSE2 #define HAS_ARGBBLENDROW_SSE2 +#define HAS_MIRRORROW_SSE2 #endif // The following are available on Neon platforms #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_MIRRORROW_NEON -#define HAS_MIRRORROWUV_NEON -#define HAS_SPLITUV_NEON #define HAS_COPYROW_NEON +#define HAS_I422TOABGRROW_NEON #define HAS_I422TOARGBROW_NEON #define HAS_I422TOBGRAROW_NEON -#define HAS_I422TOABGRROW_NEON +#define HAS_I422TORAWROW_NEON +#define HAS_I422TORGB24ROW_NEON #define HAS_I422TORGBAROW_NEON -#define HAS_YUY2TOUV422ROW_NEON -#define HAS_YUY2TOUVROW_NEON -#define HAS_YUY2TOYROW_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORROWUV_NEON +#define HAS_SETROW_NEON +#define HAS_SPLITUV_NEON #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_YUY2TOYROW_NEON + // TODO(fbarchard): Hook these up to calling functions. 
-#define HAS_ARGBTORGBAROW_NEON -#define HAS_ARGBTORGB24ROW_NEON -#define HAS_ARGBTORAWROW_NEON #define HAS_ABGRTOARGBROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGBAROW_NEON #define HAS_BGRATOARGBROW_NEON -#define HAS_RGBATOARGBROW_NEON +#define HAS_NV12TOARGBROW_NEON +#define HAS_NV21TOARGBROW_NEON #define HAS_RAWTOARGBROW_NEON #define HAS_RGB24TOARGBROW_NEON +#define HAS_RGBATOARGBROW_NEON #endif #if defined(_MSC_VER) && !defined(__CLR_VER) @@ -189,6 +196,24 @@ void I422ToRGBARow_NEON(const uint8* y_buf, const uint8* v_buf, uint8* rgb_buf, int width); +void I422ToRGB24Row_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToRAWRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void NV12ToARGBRow_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width); +void NV21ToARGBRow_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width); void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); @@ -237,6 +262,15 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count); void CopyRow_NEON(const uint8* src, uint8* dst, int count); void CopyRow_C(const uint8* src, uint8* dst, int count); +void SetRow8_X86(uint8* dst, uint32 v32, int count); +void SetRows32_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height); +void SetRow8_NEON(uint8* dst, uint32 v32, int count); +void SetRows32_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height); +void SetRow8_C(uint8* dst, uint32 v32, int count); +void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height); + void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); @@ -341,6 +375,16 @@ void I422ToRGBARow_C(const uint8* y_buf, const uint8* v_buf, uint8* rgba_buf, int width); +void I422ToRGB24Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb24_buf, + int width); +void I422ToRAWRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* raw_buf, + int width); void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, @@ -517,30 +561,44 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); - void I422ToARGBRow_Any_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); - void I422ToBGRARow_Any_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); - void I422ToABGRRow_Any_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); - void I422ToRGBARow_Any_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); +void I422ToRGB24Row_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToRAWRow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void NV12ToARGBRow_Any_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); +void NV21ToARGBRow_Any_NEON(const uint8* y_buf, + const uint8* uv_buf, + 
uint8* argb_buf, + int width); void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, @@ -671,4 +729,3 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #endif // INCLUDE_LIBYUV_ROW_H_ NOLINT - diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 9d2c099ec..99763b807 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 395 +#define LIBYUV_VERSION 396 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/libyuv.gyp b/libyuv.gyp index 94b95d054..18137538d 100644 --- a/libyuv.gyp +++ b/libyuv.gyp @@ -64,6 +64,7 @@ # sources. 'source/compare.cc', + 'source/compare_neon.cc', 'source/convert.cc', 'source/convert_argb.cc', 'source/convert_from.cc', @@ -79,6 +80,7 @@ 'source/row_posix.cc', 'source/row_win.cc', 'source/scale.cc', + 'source/scale_neon.cc', 'source/scale_argb.cc', 'source/video_common.cc', ], diff --git a/source/compare.cc b/source/compare.cc index 9959f2896..bf4a7daed 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -244,44 +244,10 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { return seed; } -#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) +#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_SUMSQUAREERROR_NEON -static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, - int count) { - volatile uint32 sse; - asm volatile ( - "vmov.u8 q7, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - - "1: \n" - "vld1.u8 {q0}, [%0]! \n" - "vld1.u8 {q1}, [%1]! \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q7, d4, d4 \n" - "vmlal.s16 q8, d6, d6 \n" - "vmlal.s16 q8, d5, d5 \n" - "vmlal.s16 q10, d7, d7 \n" - "subs %2, %2, #16 \n" - "bgt 1b \n" - - "vadd.u32 q7, q7, q8 \n" - "vadd.u32 q9, q9, q10 \n" - "vadd.u32 q10, q7, q9 \n" - "vpaddl.u32 q1, q10 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10"); - return sse; -} +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); #elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SUMSQUAREERROR_SSE2 diff --git a/source/compare_neon.cc b/source/compare_neon.cc new file mode 100644 index 000000000..7a7eb5fc7 --- /dev/null +++ b/source/compare_neon.cc @@ -0,0 +1,62 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + ".p2align 2 \n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" + "vld1.u8 {q1}, [%1]! 
\n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/source/convert.cc b/source/convert.cc index fac6c674a..0882c92ba 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -62,6 +62,7 @@ int I420Copy(const uint8* src_y, int src_stride_y, return 0; } +// Move to row_win etc. #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_HALFROW_SSE2 __declspec(naked) __declspec(align(16)) @@ -188,7 +189,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y, // Blends 32x2 pixels to 16x1 // source in scale.cc -#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) +#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_SCALEROWDOWN2_NEON void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); @@ -393,7 +394,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, // M420 format description: // M420 is row biplanar 420: 2 rows of Y and 1 row of UV. // Chroma is half width / half height. (420) -// src_stride_m420 is row planar. Normally this will be the width in pixels. +// src_stride_m420 is row planar. Normally this will be the width in pixels. // The UV plane is half width, but 2 values, so src_stride_m420 applies to // this as well as the two Y planes. static int X420ToI420(const uint8* src_y, @@ -592,10 +593,10 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, // This policy assumes that the caller handles the last row of an odd height // image using C. // READSAFE_PAGE - enable read ahead within same page. -// A page is 4096 bytes. When reading ahead, if the last pixel is near the +// A page is 4096 bytes. When reading ahead, if the last pixel is near the // end the page, and a read spans the page into the next page, a memory // exception can occur if that page has not been allocated, or is a guard -// page. This setting ensures the overread is within the same page. +// page. This setting ensures the overread is within the same page. // READSAFE_ALWAYS - enables read ahead on systems without memory exceptions // or where buffers are padded by 64 bytes. @@ -790,7 +791,7 @@ static inline uint32 READWORD(const uint8* p) { } #endif -// Must be multiple of 6 pixels. Will over convert to handle remainder. +// Must be multiple of 6 pixels. Will over convert to handle remainder. // https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210 static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) { for (int x = 0; x < width; x += 6) { @@ -820,7 +821,7 @@ static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) { } // Convert V210 to I420. -// V210 is 10 bit version of UYVY. 16 bytes to store 6 pixels. +// V210 is 10 bit version of UYVY. 16 bytes to store 6 pixels. // With is multiple of 48. 
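For reference, a sketch (not part of the diff) of the v210 packing referenced above: each little-endian 32-bit word holds three 10-bit components, and the C row keeps only the top 8 bits of each, which is how 16 bytes cover 6 pixels.

// One v210 word -> three 8-bit components (drop the low 2 bits of each 10-bit value).
static void UnpackV210Word(uint32 w, uint8* c0, uint8* c1, uint8* c2) {
  *c0 = (w >> 2) & 0xff;   // component in bits 0-9
  *c1 = (w >> 12) & 0xff;  // component in bits 10-19
  *c2 = (w >> 22) & 0xff;  // component in bits 20-29
}
// Four such words (16 bytes) yield 12 components in UYVY order
// (U0 Y0 V0, Y1 U1 Y2, V1 Y3 U2, Y4 V2 Y5), i.e. 6 pixels.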
LIBYUV_API int V210ToI420(const uint8* src_v210, int src_stride_v210, @@ -1611,7 +1612,7 @@ static void JpegI400ToI420(void* opaque, } // MJPG (Motion JPeg) to I420 -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. LIBYUV_API int MJPGToI420(const uint8* sample, size_t sample_size, @@ -1689,7 +1690,7 @@ int MJPGToI420(const uint8* sample, ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 411 is supported by libjpeg + // factors that occur in practice. 411 is supported by libjpeg // ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; @@ -1734,7 +1735,7 @@ int ConvertToI420(const uint8* sample, } int r = 0; - // One pass rotation is available for some formats. For the rest, convert + // One pass rotation is available for some formats. For the rest, convert // to I420 (with optional vertical flipping) into a temporary I420 buffer, // and then rotate the I420 to the final destination buffer. // For in-place conversion, if destination y is same as source sample, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 2f1acf154..1c5aa9d9f 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -556,6 +556,14 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif for (int y = 0; y < height; ++y) { NV12ToARGBRow(src_y, src_uv, dst_argb, width); @@ -571,10 +579,10 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, // Convert NV21 to ARGB. LIBYUV_API int NV21ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, + const uint8* src_uv, int src_stride_uv, uint8* dst_argb, int dst_stride_argb, int width, int height) { - if (!src_y || !src_vu || !dst_argb || + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -585,7 +593,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } void (*NV21ToARGBRow)(const uint8* y_buf, - const uint8* vu_buf, + const uint8* uv_buf, uint8* rgb_buf, int width) = NV21ToARGBRow_C; #if defined(HAS_NV21TOARGBROW_SSSE3) @@ -599,13 +607,21 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV21TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_NEON; + } + } +#endif for (int y = 0; y < height; ++y) { - NV21ToARGBRow(src_y, src_vu, dst_argb, width); + NV21ToARGBRow(src_y, src_uv, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { - src_vu += src_stride_vu; + src_uv += src_stride_uv; } } return 0; @@ -890,7 +906,7 @@ static void JpegI400ToARGB(void* opaque, } // MJPG (Motion JPeg) to ARGB -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. 
LIBYUV_API int MJPGToARGB(const uint8* sample, size_t sample_size, @@ -966,7 +982,7 @@ int MJPGToARGB(const uint8* sample, ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 411 is supported by libjpeg + // factors that occur in practice. 411 is supported by libjpeg // ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; @@ -1004,7 +1020,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, } int r = 0; - // One pass rotation is available for some formats. For the rest, convert + // One pass rotation is available for some formats. For the rest, convert // to I420 (with optional vertical flipping) into a temporary I420 buffer, // and then rotate the I420 to the final destination buffer. // For in-place conversion, if destination dst_argb is same as source sample, diff --git a/source/convert_from.cc b/source/convert_from.cc index bfa56ec80..4ea974acf 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -203,7 +203,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y, return 0; } -// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 LIBYUV_API int I400Copy(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, @@ -895,68 +895,50 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y, } // Convert I420 to RGB24. -// TODO(fbarchard): One step I420ToRGB24Row_NEON. LIBYUV_API int I420ToRGB24(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, int width, int height) { if (!src_y || !src_u || !src_v || - !dst_argb || + !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; } - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_NEON) + void (*I422ToRGB24Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGB24Row_C; +#if defined(HAS_I422TORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } -#elif defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } -#endif - - SIMD_ALIGNED(uint8 row[kMaxStride]); - void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = - ARGBToRGB24Row_C; -#if defined(HAS_ARGBTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (width * 3 <= kMaxStride) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; - } - if (IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; } } -#endif -#if defined(HAS_ARGBTORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (width * 3 <= kMaxStride) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; - } +#elif defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToRGB24Row = ARGBToRGB24Row_NEON; + I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3; + if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } } } #endif for (int y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, row, width); - ARGBToRGB24Row(row, dst_argb, width); - dst_argb += dst_stride_argb; + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width); + dst_rgb24 += dst_stride_rgb24; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; @@ -967,67 +949,50 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, } // Convert I420 to RAW. -// TODO(fbarchard): One step I420ToRAWRow_NEON. LIBYUV_API int I420ToRAW(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, + uint8* dst_raw, int dst_stride_raw, int width, int height) { if (!src_y || !src_u || !src_v || - !dst_argb || + !dst_raw || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + dst_raw = dst_raw + (height - 1) * dst_stride_raw; + dst_stride_raw = -dst_stride_raw; } - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_NEON) + void (*I422ToRAWRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRAWRow_C; +#if defined(HAS_I422TORAWROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } -#elif defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } -#endif - - SIMD_ALIGNED(uint8 row[kMaxStride]); - void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = - ARGBToRAWRow_C; -#if defined(HAS_ARGBTORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (width * 3 <= kMaxStride) { - ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; - } - if (IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBToRAWRow = ARGBToRAWRow_SSSE3; + I422ToRAWRow = I422ToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToRAWRow = I422ToRAWRow_NEON; } } -#elif defined(HAS_ARGBTORAWROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (width * 3 <= kMaxStride) { - ARGBToRAWRow = ARGBToRAWRow_Any_NEON; - } +#elif defined(HAS_I422TORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRAWRow = I422ToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToRAWRow = ARGBToRAWRow_NEON; + I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { + I422ToRAWRow = I422ToRAWRow_SSSE3; + } } } #endif for (int y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, row, width); - ARGBToRAWRow(row, dst_argb, width); - dst_argb += dst_stride_argb; + I422ToRAWRow(src_y, src_u, src_v, dst_raw, width); + dst_raw += dst_stride_raw; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; diff --git a/source/cpu_id.cc b/source/cpu_id.cc index 941e399e5..2e96d9b9b 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -29,7 +29,7 @@ // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( + asm volatile ( // NOLINT "mov %%ebx, %%edi \n" "cpuid \n" "xchg %%edi, %%ebx \n" @@ -38,7 +38,7 @@ static __inline void __cpuid(int cpu_info[4], int info_type) { } #elif defined(__i386__) || defined(__x86_64__) static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( + asm volatile ( // NOLINT "cpuid \n" : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "a"(info_type)); @@ -50,7 +50,7 @@ namespace libyuv { extern "C" { #endif -// Low level cpuid for X86. Returns zeros on other CPUs. +// Low level cpuid for X86. Returns zeros on other CPUs. 
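A usage sketch (not part of the diff) for the CpuId helper documented above. The array order follows the "=a"/"=b"/"=c"/"=d" constraints in the inline asm (EAX, EBX, ECX, EDX), and the bit positions are the standard CPUID leaf-1 feature bits.

int cpu_info[4];     // { eax, ebx, ecx, edx }
CpuId(cpu_info, 1);  // leaf 1: processor feature flags
bool has_sse2 = (cpu_info[3] & (1 << 26)) != 0;   // EDX bit 26
bool has_ssse3 = (cpu_info[2] & (1 << 9)) != 0;   // ECX bit 9
// On non-x86 builds (e.g. ARM) CpuId fills the array with zeros.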
#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \ defined(__i386__) || defined(__x86_64__)) LIBYUV_API @@ -85,7 +85,7 @@ static uint32 XGetBV(unsigned int xcr) { #define HAS_XGETBV static uint32 XGetBV(unsigned int xcr) { uint32 xcr_feature_mask; - asm volatile ( + asm volatile ( // NOLINT ".byte 0x0f, 0x01, 0xd0\n" : "=a"(xcr_feature_mask) : "c"(xcr) @@ -124,6 +124,18 @@ int ArmCpuCaps(const char* cpuinfo_name) { LIBYUV_API int cpu_info_ = 0; +// Test environment variable for disabling CPU features. Any non-zero value +// to disable. Zero ignored to make it easy to set the variable on/off. +static bool TestEnv(const char* name) { + const char* var = getenv(name); + if (var) { + if (var[0] != '0') { + return true; + } + } + return false; +} + LIBYUV_API int InitCpuFlags(void) { #if !defined(__CLR_VER) && defined(CPU_X86) @@ -144,34 +156,33 @@ int InitCpuFlags(void) { } } #endif - // environment variable overrides for testing. - if (getenv("LIBYUV_DISABLE_X86")) { + if (TestEnv("LIBYUV_DISABLE_X86")) { cpu_info_ &= ~kCpuHasX86; } - if (getenv("LIBYUV_DISABLE_SSE2")) { + if (TestEnv("LIBYUV_DISABLE_SSE2")) { cpu_info_ &= ~kCpuHasSSE2; } - if (getenv("LIBYUV_DISABLE_SSSE3")) { + if (TestEnv("LIBYUV_DISABLE_SSSE3")) { cpu_info_ &= ~kCpuHasSSSE3; } - if (getenv("LIBYUV_DISABLE_SSE41")) { + if (TestEnv("LIBYUV_DISABLE_SSE41")) { cpu_info_ &= ~kCpuHasSSE41; } - if (getenv("LIBYUV_DISABLE_SSE42")) { + if (TestEnv("LIBYUV_DISABLE_SSE42")) { cpu_info_ &= ~kCpuHasSSE42; } - if (getenv("LIBYUV_DISABLE_AVX")) { + if (TestEnv("LIBYUV_DISABLE_AVX")) { cpu_info_ &= ~kCpuHasAVX; } - if (getenv("LIBYUV_DISABLE_AVX2")) { + if (TestEnv("LIBYUV_DISABLE_AVX2")) { cpu_info_ &= ~kCpuHasAVX2; } - if (getenv("LIBYUV_DISABLE_ASM")) { + if (TestEnv("LIBYUV_DISABLE_ASM")) { cpu_info_ = kCpuInitialized; } #elif defined(__arm__) -#if defined(__linux__) && defined(__ARM_NEON__) +#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) // linux arm parse text file for neon detect. cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); #elif defined(__ARM_NEON__) @@ -181,10 +192,10 @@ int InitCpuFlags(void) { cpu_info_ = kCpuHasNEON; #endif cpu_info_ |= kCpuInitialized | kCpuHasARM; - if (getenv("LIBYUV_DISABLE_NEON")) { + if (TestEnv("LIBYUV_DISABLE_NEON")) { cpu_info_ &= ~kCpuHasNEON; } - if (getenv("LIBYUV_DISABLE_ASM")) { + if (TestEnv("LIBYUV_DISABLE_ASM")) { cpu_info_ = kCpuInitialized; } #endif // __arm__ diff --git a/source/format_conversion.cc b/source/format_conversion.cc index 4062c25b8..ed12de88d 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -21,7 +21,7 @@ extern "C" { #endif // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers -// and vst would select which 2 components to write. The low level would need +// and vst would select which 2 components to write. The low level would need // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) diff --git a/source/mjpeg_decoder.cc b/source/mjpeg_decoder.cc index 47840116b..aa603947a 100644 --- a/source/mjpeg_decoder.cc +++ b/source/mjpeg_decoder.cc @@ -10,6 +10,7 @@ #include "libyuv/mjpeg_decoder.h" +#ifdef HAVE_JPEG // Must be included before jpeglib #include #ifndef __CLR_VER @@ -80,7 +81,7 @@ MJpegDecoder::~MJpegDecoder() { } // Helper function to validate the jpeg looks ok. -// TODO(fbarchard): Improve performance. Scan backward for EOI? +// TODO(fbarchard): Improve performance. Scan backward for EOI? 
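For context (standard JPEG framing, not specific to this change): a JPEG stream starts with an SOI marker (0xFF 0xD8) and its final marker is EOI (0xFF 0xD9); ValidateJpeg scans forward counting EOI markers, and the TODO above suggests scanning backward from the end instead. A hypothetical check of just the leading marker:

// Hypothetical helper: cheap sanity check of the start-of-image marker only.
static bool HasJpegSoi(const uint8* sample, size_t sample_size) {
  return sample_size >= 2 && sample[0] == 0xff && sample[1] == 0xd8;  // SOI
}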
bool ValidateJpeg(const uint8* sample, size_t sample_size) { if (sample_size < 64) { // ERROR: Invalid jpeg size: sample_size @@ -105,7 +106,7 @@ bool ValidateJpeg(const uint8* sample, size_t sample_size) { } } if (!total_eoi) { - // ERROR: Invalid jpeg end code not found. Size sample_size + // ERROR: Invalid jpeg end code not found. Size sample_size return false; } return true; @@ -578,3 +579,5 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( } } // namespace libyuv +#endif // HAVE_JPEG + diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b972d6bd8..a7f5086a1 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -105,6 +105,130 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, } } +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + void (*YUY2ToUV422Row)(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); + void (*YUY2ToYRow)(const uint8* src_yuy2, + uint8* dst_y, int pix); + YUY2ToYRow = YUY2ToYRow_C; + YUY2ToUV422Row = YUY2ToUV422Row_C; +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } + } +#elif defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width > 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif + + for (int y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + void (*UYVYToUV422Row)(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix); + UYVYToYRow = UYVYToYRow_C; + UYVYToUV422Row = UYVYToUV422Row_C; +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } + } +#elif defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width > 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; + } + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUV422Row = UYVYToUV422Row_NEON; + } + } +#endif + + for (int y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + // Mirror I420 with optional flipping LIBYUV_API int I420Mirror(const uint8* src_y, int src_stride_y, @@ -721,6 +845,11 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, NV12ToARGBRow = NV12ToARGBRow_SSSE3; } #endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } +#endif SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = @@ -789,129 +918,6 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y, return 0; } -// SetRow8 writes 'count' bytes using a 32 bit value repeated -// SetRow32 writes 'count' words using a 32 bit value repeated - -#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) -#define HAS_SETROW_NEON -static void SetRow8_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( // NOLINT - "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - "vst1.u32 {q0}, [%0]! 
\n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "q0", "memory", "cc"); -} - -// TODO(fbarchard): Make fully assembler -static void SetRows32_NEON(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - for (int y = 0; y < height; ++y) { - SetRow8_NEON(dst, v32, width << 2); - dst += dst_stride; - } -} - -#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86) -#define HAS_SETROW_X86 -__declspec(naked) __declspec(align(16)) -static void SetRow8_X86(uint8* dst, uint32 v32, int count) { - __asm { - mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 - mov ecx, [esp + 12] // count - shr ecx, 2 - rep stosd - mov edi, edx - ret - } -} - -__declspec(naked) __declspec(align(16)) -static void SetRows32_X86(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - __asm { - push esi - push edi - push ebp - mov edi, [esp + 12 + 4] // dst - mov eax, [esp + 12 + 8] // v32 - mov ebp, [esp + 12 + 12] // width - mov edx, [esp + 12 + 16] // dst_stride - mov esi, [esp + 12 + 20] // height - lea ecx, [ebp * 4] - sub edx, ecx // stride - width * 4 - - align 16 - convertloop: - mov ecx, ebp - rep stosd - add edi, edx - sub esi, 1 - jg convertloop - - pop ebp - pop edi - pop esi - ret - } -} - -#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_SETROW_X86 -static void SetRow8_X86(uint8* dst, uint32 v32, int width) { - size_t width_tmp = static_cast(width); - asm volatile ( // NOLINT - "shr $0x2,%1 \n" - "rep stosl \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); -} - -static void SetRows32_X86(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - for (int y = 0; y < height; ++y) { - size_t width_tmp = static_cast(width); - uint32* d = reinterpret_cast(dst); - asm volatile ( // NOLINT - "rep stosl \n" - : "+D"(d), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); - dst += dst_stride; - } -} -#endif - -static void SetRow8_C(uint8* dst, uint32 v8, int count) { -#ifdef _MSC_VER - for (int x = 0; x < count; ++x) { - dst[x] = v8; - } -#else - memset(dst, v8, count); -#endif -} - -static void SetRows32_C(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - for (int y = 0; y < height; ++y) { - uint32* d = reinterpret_cast(dst); - for (int x = 0; x < width; ++x) { - d[x] = v32; - } - dst += dst_stride; - } -} - LIBYUV_API void SetPlane(uint8* dst_y, int dst_stride_y, int width, int height, @@ -929,13 +935,6 @@ void SetPlane(uint8* dst_y, int dst_stride_y, SetRow = SetRow8_X86; } #endif -#if defined(HAS_SETROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - SetRow = SetRow8_SSE2; - } -#endif uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); // Set plane @@ -1242,7 +1241,7 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, } // Computes table of cumulative sum for image where the value is the sum -// of all values above and to the left of the entry. Used by ARGBBlur. +// of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, int32* dst_cumsum, int dst_stride32_cumsum, @@ -1270,7 +1269,7 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, // Blur ARGB image. // Caller should allocate CumulativeSum table of width * height * 16 bytes -// aligned to 16 byte boundary. 
height can be radius * 2 + 2 to save memory +// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory // as the buffer is treated as circular. LIBYUV_API int ARGBBlur(const uint8* src_argb, int src_stride_argb, @@ -1290,7 +1289,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, CumulativeSumToAverage = CumulativeSumToAverage_SSE2; } #endif - // Compute enough CumulativeSum for first row to be blurred. After this + // Compute enough CumulativeSum for first row to be blurred. After this // one row of CumulativeSum is updated at a time. ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, dst_stride32_cumsum, diff --git a/source/rotate.cc b/source/rotate.cc index 5ca1ca747..cac3fa0b0 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -814,7 +814,7 @@ void RotatePlane90(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { // Rotate by 90 is a transpose with the source read - // from bottom to top. So set the source pointer to the end + // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. src += src_stride * (height - 1); src_stride = -src_stride; @@ -826,7 +826,7 @@ void RotatePlane270(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { // Rotate by 270 is a transpose with the destination written - // from bottom to top. So set the destination pointer to the end + // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. dst += dst_stride * (width - 1); dst_stride = -dst_stride; @@ -880,7 +880,7 @@ void RotatePlane180(const uint8* src, int src_stride, if (width > kMaxStride) { return; } - // Swap first and last row and mirror the content. Uses a temporary row. + // Swap first and last row and mirror the content. Uses a temporary row. SIMD_ALIGNED(uint8 row[kMaxStride]); const uint8* src_bot = src + src_stride * (height - 1); uint8* dst_bot = dst + dst_stride * (height - 1); diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index 33e1050db..9c9944674 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -58,7 +58,7 @@ void ARGBRotate90(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { // Rotate by 90 is a ARGBTranspose with the source read - // from bottom to top. So set the source pointer to the end + // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. src += src_stride * (height - 1); src_stride = -src_stride; @@ -69,7 +69,7 @@ void ARGBRotate270(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { // Rotate by 270 is a ARGBTranspose with the destination written - // from bottom to top. So set the destination pointer to the end + // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. dst += dst_stride * (width - 1); dst_stride = -dst_stride; @@ -109,7 +109,7 @@ void ARGBRotate180(const uint8* src, int src_stride, if (width * 4 > kMaxStride) { return; } - // Swap first and last row and mirror the content. Uses a temporary row. + // Swap first and last row and mirror the content. Uses a temporary row. 
SIMD_ALIGNED(uint8 row[kMaxStride]); const uint8* src_bot = src + src_stride * (height - 1); uint8* dst_bot = dst + dst_stride * (height - 1); diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index f8a0c8e7f..49b300325 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -26,12 +26,12 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter // at w-8 allow for this "sub %4, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane + // handle 8x8 blocks. this should be the majority of the plane ".p2align 4 \n" "1: \n" "mov r9, %0 \n" @@ -81,7 +81,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "subs %4, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are + // add 8 back to counter. if the result is 0 there are // no residuals. "adds %4, #8 \n" "beq 4f \n" @@ -193,12 +193,12 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_b, int dst_stride_b, int width) { asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter // at w-8 allow for this "sub %6, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane + // handle 8x8 blocks. this should be the majority of the plane ".p2align 4 \n" "1: \n" "mov r9, %0 \n" @@ -264,7 +264,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "subs %6, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are + // add 8 back to counter. if the result is 0 there are // no residuals. "adds %6, #8 \n" "beq 4f \n" diff --git a/source/row_common.cc b/source/row_common.cc index 07596506d..c5f3ce050 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -330,7 +330,7 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { int sb = (b * 17 + g * 68 + r * 35) >> 7; int sg = (b * 22 + g * 88 + r * 45) >> 7; int sr = (b * 24 + g * 98 + r * 50) >> 7; - // b does not over flow. a is preserved from original. + // b does not over flow. a is preserved from original. if (sg > 255) { sg = 255; } @@ -344,7 +344,7 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { } } -// Apply color matrix to a row of image. Matrix is signed. +// Apply color matrix to a row of image. Matrix is signed. 
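A worked example (not part of the diff) of why the ARGBSepiaRow_C code above clamps sg and sr but not sb: the blue coefficients sum to 17 + 68 + 35 = 120 < 128, so sb can never exceed 255 after the >> 7, while the green and red coefficient sums (155 and 172) can.

// All-white input pixel (b = g = r = 255):
//   sb = 255 * 120 >> 7 = 239          no clamp needed
//   sg = 255 * 155 >> 7 = 308 -> 255   clamped
//   sr = 255 * 172 >> 7 = 342 -> 255   clamped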
void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) { for (int x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -459,6 +459,14 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf, (255u << ashift); } +static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r) { + int32 y1 = (static_cast(y) - 16) * YG; + *b = Clip(static_cast((u * UB + v * VB) - (BB) + y1) >> 6); + *g = Clip(static_cast((u * UG + v * VG) - (BG) + y1) >> 6); + *r = Clip(static_cast((u * UR + v * VR) - (BR) + y1) >> 6); +} + void I444ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -492,6 +500,48 @@ void I422ToARGBRow_C(const uint8* y_buf, } } +void I422ToRGB24Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel2(y_buf[0], u_buf[0], v_buf[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YuvPixel2(y_buf[1], u_buf[0], v_buf[0], + rgb_buf + 3, rgb_buf + 4, rgb_buf + 5); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel2(y_buf[0], u_buf[0], v_buf[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} + +void I422ToRAWRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel2(y_buf[0], u_buf[0], v_buf[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + YuvPixel2(y_buf[1], u_buf[0], v_buf[0], + rgb_buf + 5, rgb_buf + 4, rgb_buf + 3); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel2(y_buf[0], u_buf[0], v_buf[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} + void I411ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -671,6 +721,28 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) { memcpy(dst, src, count); } +void SetRow8_C(uint8* dst, uint32 v8, int count) { +#ifdef _MSC_VER + // VC will generate rep stosb. + for (int x = 0; x < count; ++x) { + dst[x] = v8; + } +#else + memset(dst, v8, count); +#endif +} + +void SetRows32_C(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + uint32* d = reinterpret_cast(dst); + for (int x = 0; x < width; ++x) { + d[x] = v32; + } + dst += dst_stride; + } +} + // Filter 2 rows of YUY2 UV's (422) into U and V (420). 
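The YUY2 row functions here (such as YUY2ToUVRow_C below and YUY2ToUV422Row_C) back both the existing YUY2ToI420 path and the new YUY2ToI422 entry point added to planar_functions.cc in this change. A minimal usage sketch of the new API (not part of the diff), assuming tightly packed planes:

// YUY2 is 2 bytes per pixel; I422 keeps full-height chroma at half width,
// so dst_y is width x height and dst_u/dst_v are (width / 2) x height.
int result = YUY2ToI422(src_yuy2, width * 2,   // source and its byte stride
                        dst_y, width,
                        dst_u, width / 2,
                        dst_v, width / 2,
                        width, height);
// Returns 0 on success, matching the other converters.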
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u, uint8* dst_v, int width) { @@ -950,6 +1022,11 @@ Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0) YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1) YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1) #endif +#ifdef HAS_I422TORGB24ROW_SSSE3 +YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \ + I422ToRGB24Row_C, 1) +YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1) +#endif #ifdef HAS_I422TORGBAROW_SSSE3 YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1) #endif @@ -958,6 +1035,10 @@ YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1) YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1) YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1) YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1) +Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0) +Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0) +YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1) +YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1) #endif #undef YANY diff --git a/source/row_neon.cc b/source/row_neon.cc index d6b9b31d0..19a783305 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -101,8 +101,8 @@ void I422ToARGBRow_NEON(const uint8* y_buf, "+r"(width) // %4 : "r"(&kUVToRB), // %5 "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15" + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #endif // HAS_I422TOARGBROW_NEON @@ -135,8 +135,8 @@ void I422ToBGRARow_NEON(const uint8* y_buf, "+r"(width) // %4 : "r"(&kUVToRB), // %5 "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #endif // HAS_I422TOBGRAROW_NEON @@ -169,8 +169,8 @@ void I422ToABGRRow_NEON(const uint8* y_buf, "+r"(width) // %4 : "r"(&kUVToRB), // %5 "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #endif // HAS_I422TOABGRROW_NEON @@ -202,12 +202,77 @@ void I422ToRGBARow_NEON(const uint8* y_buf, "+r"(width) // %4 : "r"(&kUVToRB), // %5 "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #endif // HAS_I422TORGBAROW_NEON +#ifdef HAS_I422TORGB24ROW_NEON +void I422ToRGB24Row_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGB24ROW_NEON + +#ifdef HAS_I422TORAWROW_NEON +void I422ToRAWRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORAWROW_NEON + #ifdef HAS_NV12TOARGBROW_NEON void NV12ToARGBRow_NEON(const uint8* y_buf, const uint8* uv_buf, @@ -233,8 +298,8 @@ void NV12ToARGBRow_NEON(const uint8* y_buf, "+r"(width) // %3 : "r"(&kUVToRB), // %4 "r"(&kUVToG) // %5 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15" + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #endif // HAS_NV12TOARGBROW_NEON @@ -264,8 +329,8 @@ void NV21ToARGBRow_NEON(const uint8* y_buf, "+r"(width) // %3 : "r"(&kUVToRB), // %4 "r"(&kUVToG) // %5 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15" + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #endif // HAS_NV21TOARGBROW_NEON @@ -312,7 +377,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_NEON #ifdef HAS_SETROW_NEON -// SetRow8 writes 'count' bytes using a 32 bit value repeated +// SetRow8 writes 'count' bytes using a 32 bit value repeated. void SetRow8_NEON(uint8* dst, uint32 v32, int count) { asm volatile ( // NOLINT "vdup.u32 q0, %2 \n" // duplicate 4 ints @@ -327,7 +392,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) { } // TODO(fbarchard): Make fully assembler -// SetRow32 writes 'count' words using a 32 bit value repeated +// SetRow32 writes 'count' words using a 32 bit value repeated. void SetRows32_NEON(uint8* dst, uint32 v32, int width, int dst_stride, int height) { for (int y = 0; y < height; ++y) { @@ -344,11 +409,11 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %1, %2 \n" // work on segments that are multiples of 16 "lsrs r3, %2, #4 \n" - // the output is written in two block. 8 bytes followed - // by another 8. reading is done sequentially, from left to - // right. writing is done from right to left in block sizes + // the output is written in two block. 8 bytes followed + // by another 8. reading is done sequentially, from left to + // right. writing is done from right to left in block sizes // %1, the destination pointer is incremented after writing - // the first of the two blocks. need to subtract that 8 off + // the first of the two blocks. need to subtract that 8 off // along with 16 to get the next location. 
"mov r3, #-24 \n" "beq 2f \n" @@ -356,9 +421,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { // back of destination by the size of the register that is // going to be mirrored "sub %1, #16 \n" - // the loop needs to run on blocks of 16. what will be left + // the loop needs to run on blocks of 16. what will be left // over is either a negative number, the residuals that need - // to be done, or 0. if this isn't subtracted off here the + // to be done, or 0. If this isn't subtracted off here the // loop will run one extra time. "sub %2, #16 \n" @@ -375,7 +440,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 "bge 1b \n" - // add 16 back to the counter. if the result is 0 there is no + // add 16 back to the counter. if the result is 0 there is no // residuals so jump past "adds %2, #16 \n" "beq 5f \n" @@ -430,9 +495,9 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { // going to be mirrord "sub %1, #8 \n" "sub %2, #8 \n" - // the loop needs to run on blocks of 8. what will be left + // the loop needs to run on blocks of 8. what will be left // over is either a negative number, the residuals that need - // to be done, or 0. if this isn't subtracted off here the + // to be done, or 0. if this isn't subtracted off here the // loop will run one extra time. "sub %3, #8 \n" @@ -446,7 +511,7 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 "bge 1b \n" - // add 8 back to the counter. if the result is 0 there is no + // add 8 back to the counter. if the result is 0 there is no // residuals so return "adds %3, #8 \n" "beq 4f \n" diff --git a/source/row_posix.cc b/source/row_posix.cc index 65416c0f7..33149dada 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -741,9 +741,9 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } // TODO(fbarchard): pass xmm constants to single block of assembly. -// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes -// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, -// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around +// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes +// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, +// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around // and considered unsafe. 
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
@@ -2143,6 +2143,34 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
}
#endif // HAS_COPYROW_X86
+#ifdef HAS_SETROW_X86
+void SetRow8_X86(uint8* dst, uint32 v32, int width) {
+ size_t width_tmp = static_cast<size_t>(width);
+ asm volatile (
+ "shr $0x2,%1 \n"
+ "rep stosl \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ size_t width_tmp = static_cast<size_t>(width);
+ uint32* d = reinterpret_cast<uint32*>(dst);
+ asm volatile (
+ "rep stosl \n"
+ : "+D"(d), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+ dst += dst_stride;
+ }
+}
+#endif // HAS_SETROW_X86
+
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
@@ -2998,7 +3026,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#endif // HAS_ARGBUNATTENUATEROW_SSE2
#ifdef HAS_ARGBGRAYROW_SSSE3
-// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
+// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
CONST vec8 kARGBToGray = {
14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};
@@ -3455,7 +3483,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
-// an error if movq is used. movd %%xmm0,%1
+// an error if movq is used. movd %%xmm0,%1
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
diff --git a/source/row_win.cc b/source/row_win.cc
index a02f002e3..e3b01f27f 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -18,6 +18,7 @@ extern "C" {
// This module is for Visual C x86.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3
// Constants for ARGB.
@@ -2521,6 +2522,54 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_X86
+#ifdef HAS_SETROW_X86
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRow8_X86(uint8* dst, uint32 v32, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
+ shr ecx, 2
+ rep stosd
+ mov edi, edx
+ ret
+ }
+}
+
+// SetRow32 writes 'count' words using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16)) +void SetRows32_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + __asm { + push esi + push edi + push ebp + mov edi, [esp + 12 + 4] // dst + mov eax, [esp + 12 + 8] // v32 + mov ebp, [esp + 12 + 12] // width + mov edx, [esp + 12 + 16] // dst_stride + mov esi, [esp + 12 + 20] // height + lea ecx, [ebp * 4] + sub edx, ecx // stride - width * 4 + + align 16 + convertloop: + mov ecx, ebp + rep stosd + add edi, edx + sub esi, 1 + jg convertloop + + pop ebp + pop edi + pop esi + ret + } +} +#endif // HAS_SETROW_X86 + #ifdef HAS_YUY2TOYROW_SSE2 __declspec(naked) __declspec(align(16)) void YUY2ToYRow_SSE2(const uint8* src_yuy2, @@ -3497,7 +3546,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. -// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R +// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. __declspec(naked) __declspec(align(16)) void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, diff --git a/source/scale.cc b/source/scale.cc index c3458300b..38910c91a 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -54,514 +54,49 @@ void SetUseReferenceImpl(bool use) { #define HAS_SCALEROWDOWN2_NEON // Note - not static due to reuse in convert for 444 to 420. void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - "vld2.u8 {q0,q1}, [%0]! \n" - "vst1.u8 {q0}, [%1]! \n" // store even pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); -} + uint8* dst, int dst_width); void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc - "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - // row 2 add adjacent, add row 1 to row 2 - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vst1.u8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" // 16 processed per loop - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "q0", "q1", "q2", "q3" // Clobber List - ); -} + uint8* dst, int dst_width); #define HAS_SCALEROWDOWN4_NEON -static void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - "vld2.u8 {d0, d1}, [%0]! \n" - "vtrn.u8 d1, d0 \n" - "vshrn.u16 d0, q0, #8 \n" - "vst1.u32 {d0[1]}, [%1]! \n" - - "subs %2, #4 \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1", "memory", "cc" - ); -} - -static void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "add r4, %0, %3 \n" - "add r5, r4, %3 \n" - "add %3, r5, %3 \n" - "1: \n" - "vld1.u8 {q0}, [%0]! \n" // load up 16x4 - "vld1.u8 {q1}, [r4]! \n" - "vld1.u8 {q2}, [r5]! 
\n" - "vld1.u8 {q3}, [%3]! \n" - - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - - "vpaddl.u16 q0, q0 \n" - - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - - "vmovn.u16 d0, q0 \n" - "vst1.u32 {d0[0]}, [%1]! \n" - - "subs %2, #4 \n" - "bgt 1b \n" - - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" - ); -} +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); #define HAS_SCALEROWDOWN34_NEON -// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -static void ScaleRowDown34_NEON(const uint8* src_ptr, - ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.u8 {d0, d1, d2}, [%1]! \n" - "subs %2, #24 \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "d0", "d1", "d2", "d3", "memory", "cc" - ); -} - -static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" - - // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" - - // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" - - "vst3.u8 {d0, d1, d2}, [%1]! \n" - - "subs %2, #24 \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" - ); -} - -static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - - // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" - - "vst3.u8 {d0, d1, d2}, [%1]! 
\n" - - "subs %2, #24 \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" - ); -} +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); #define HAS_SCALEROWDOWN38_NEON -const uvec8 kShuf38 = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -const uvec8 kShuf38_2 = - { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; -const vec16 kMult38_Div6 = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -const vec16 kMult38_Div9 = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; - // 32 -> 12 -static void ScaleRowDown38_NEON(const uint8* src_ptr, - ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vld1.u8 {q3}, [%3] \n" - "1: \n" - "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.u8 {d4}, [%1]! \n" - "vst1.u32 {d5[0]}, [%1]! \n" - "subs %2, #12 \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" - ); -} - +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); // 32x3 -> 12x1 -static void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vld1.u16 {q13}, [%4] \n" - "vld1.u8 {q14}, [%5] \n" - "vld1.u8 {q15}, [%6] \n" - "add r4, %0, %3, lsl #1 \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. 
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q15 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.u8 {d3}, [%1]! \n" - "vst1.u32 {d4[0]}, [%1]! \n" - "subs %2, #12 \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2), // %5 - "r"(&kMult38_Div9) // %6 - : "r4", "q0", "q1", "q2", "q3", "q8", "q9", - "q13", "q14", "q15", "memory", "cc" - ); -} - +void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); // 32x2 -> 12x1 -static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vld1.u16 {q13}, [%4] \n" - "vld1.u8 {q14}, [%5] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q13 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.u8 {d3}, [%1]! \n" - "vst1.u32 {d4[0]}, [%1]! 
\n" - "subs %2, #12 \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" - ); -} - +void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); // 16x2 -> 16x1 #define HAS_SCALEFILTERROWS_NEON -static void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - asm volatile ( - "cmp %4, #0 \n" - "beq 2f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 3f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - "1: \n" - "vld1.u8 {q0}, [%1]! \n" - "vld1.u8 {q1}, [%2]! \n" - "subs %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.u8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 4f \n" - - "2: \n" - "vld1.u8 {q0}, [%1]! \n" - "subs %3, #16 \n" - "vst1.u8 {q0}, [%0]! \n" - "bgt 2b \n" - "b 4f \n" - - "3: \n" - "vld1.u8 {q0}, [%1]! \n" - "vld1.u8 {q1}, [%2]! \n" - "subs %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.u8 {q0}, [%0]! \n" - "bgt 3b \n" - "4: \n" - "vst1.u8 {d1[7]}, [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 - : - : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" - ); -} +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction); /** * SSE2 downscalers with interpolation. @@ -1010,7 +545,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, #define HAS_SCALEROWDOWN34_SSSE3 // Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. // Note that movdqa+palign may be better than movdqu. @@ -1049,7 +584,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. // Register usage: @@ -3420,7 +2955,7 @@ int I420Scale(const uint8* src_y, int src_stride_y, dst_halfwidth = dst_width >> 1; } // If caller used height / 2 when computing src_v, it will point into what - // should be the src_u plane. Detect this and reduce halfheight to match. + // should be the src_u plane. Detect this and reduce halfheight to match. int uv_src_plane_size = src_halfwidth * src_halfheight; if ((src_height & 1) && (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { @@ -3484,7 +3019,7 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, dst_halfwidth = dst_width >> 1; } // If caller used height / 2 when computing src_v, it will point into what - // should be the src_u plane. Detect this and reduce halfheight to match. + // should be the src_u plane. Detect this and reduce halfheight to match. 
int uv_src_plane_size = src_halfwidth * src_halfheight; if ((src_height & 1) && (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { diff --git a/source/scale_neon.cc b/source/scale_neon.cc new file mode 100644 index 000000000..a1946f051 --- /dev/null +++ b/source/scale_neon.cc @@ -0,0 +1,534 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) + +/** + * NEON downscalers with interpolation. + * + * Provided by Fritz Koenig + * + */ + +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.u8 {q0,q1}, [%0]! \n" + "vst1.u8 {q0}, [%1]! \n" // store even pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc + "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.u8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {d0, d1}, [%0]! \n" + "vtrn.u8 d1, d0 \n" + "vshrn.u16 d0, q0, #8 \n" + "vst1.u32 {d0[1]}, [%1]! \n" + "subs %2, #4 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "add r4, %0, %3 \n" + "add r5, r4, %3 \n" + "add %3, r5, %3 \n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" // load up 16x4 + "vld1.u8 {q1}, [r4]! \n" + "vld1.u8 {q2}, [r5]! \n" + "vld1.u8 {q3}, [%3]! \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.u32 {d0[0]}, [%1]! \n" + "subs %2, #4 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(src_stride) // %3 + : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. 
+void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.u8 {d0, d1, d2}, [%1]! \n" + "subs %2, #24 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +const uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +const uvec8 kShuf38_2 = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +const vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +const vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u8 {q3}, [%3] \n" + "1: \n" + "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.u8 {d4}, [%1]! 
\n" + "vst1.u32 {d5[0]}, [%1]! \n" + "subs %2, #12 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "vld1.u8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2), // %5 + "r"(&kMult38_Div9) // %6 + : "r4", "q0", "q1", "q2", "q3", "q8", "q9", + "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! 
\n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 2f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 3f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + "1: \n" + "vld1.u8 {q0}, [%1]! \n" + "vld1.u8 {q1}, [%2]! \n" + "subs %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.u8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 4f \n" + + "2: \n" + "vld1.u8 {q0}, [%1]! \n" + "subs %3, #16 \n" + "vst1.u8 {q0}, [%0]! \n" + "bgt 2b \n" + "b 4f \n" + + "3: \n" + "vld1.u8 {q0}, [%1]! \n" + "vld1.u8 {q1}, [%2]! \n" + "subs %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.u8 {q0}, [%0]! 
\n" + "bgt 3b \n" + "4: \n" + "vst1.u8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + ); +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/unit_test/rotate_argb_test.cc b/unit_test/rotate_argb_test.cc index 222dc2c75..fe8435e12 100644 --- a/unit_test/rotate_argb_test.cc +++ b/unit_test/rotate_argb_test.cc @@ -76,8 +76,8 @@ static int ARGBTestRotate(int src_width, int src_height, printf("filter %d - %8d us C - %8d us OPT\n", mode, static_cast(c_time*1e6), static_cast(opt_time*1e6)); - // C version may be a little off from the optimized. Order of - // operations may introduce rounding somewhere. So do a difference + // C version may be a little off from the optimized. Order of + // operations may introduce rounding somewhere. So do a difference // of the buffers and look to see that the max difference isn't // over 2. int max_diff = 0; diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index 87a76e467..fef967645 100644 --- a/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc @@ -80,8 +80,8 @@ static int ARGBTestFilter(int src_width, int src_height, printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast(c_time*1e6), static_cast(opt_time*1e6)); - // C version may be a little off from the optimized. Order of - // operations may introduce rounding somewhere. So do a difference + // C version may be a little off from the optimized. Order of + // operations may introduce rounding somewhere. So do a difference // of the buffers and look to see that the max difference isn't // over 2. int max_diff = 0; diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index f204d0642..55b4148d2 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -118,8 +118,8 @@ static int TestFilter(int src_width, int src_height, printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast(c_time*1e6), static_cast(opt_time*1e6)); - // C version may be a little off from the optimized. Order of - // operations may introduce rounding somewhere. So do a difference + // C version may be a little off from the optimized. Order of + // operations may introduce rounding somewhere. So do a difference // of the buffers and look to see that the max difference isn't // over 2. int max_diff = 0;