Move Neon source to its own files.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/860009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@396 16f28f9a-4ce2-e073-06de-1de4eb20be90
Author:  fbarchard@google.com  2012-10-09 00:05:29 +00:00
Parent:  4807dea4e7
Commit:  64ce0ab544
32 changed files with 1262 additions and 868 deletions


@@ -22,8 +22,10 @@ LOCAL_SRC_FILES := \
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
     LOCAL_CFLAGS += -DLIBYUV_NEON
     LOCAL_SRC_FILES += \
+        source/compare_neon.cc \
         source/rotate_neon.cc.neon \
-        source/row_neon.cc.neon
+        source/row_neon.cc.neon \
+        source/scale_neon.cc
 endif
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/include


@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 395
+Version: 396
 License: BSD
 License File: LICENSE


@@ -27,12 +27,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
               int width, int height,
               uint32 value);
 
+// Alias.
+#define I400ToI400 CopyPlane
+
 // Copy a plane of data (I420 to I400).
 LIBYUV_API
 void CopyPlane(const uint8* src_y, int src_stride_y,
                uint8* dst_y, int dst_stride_y,
                int width, int height);
 
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
 // Convert I420 to I400. (calls CopyPlane ignoring u/v).
 LIBYUV_API
 int I420ToI400(const uint8* src_y, int src_stride_y,


@@ -66,6 +66,7 @@ extern "C" {
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
+#define HAS_SETROW_X86
 #define HAS_SPLITUV_SSE2
 #define HAS_UYVYTOUV422ROW_SSE2
 #define HAS_UYVYTOUVROW_SSE2
@@ -76,13 +77,13 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 
 // Effects
-#define HAS_ARGBMIRRORROW_SSSE3
 #define HAS_ARGBAFFINEROW_SSE2
 #define HAS_ARGBATTENUATEROW_SSSE3
 #define HAS_ARGBBLENDROW_SSSE3
 #define HAS_ARGBCOLORMATRIXROW_SSSE3
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBINTERPOLATEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSSE3
 #define HAS_ARGBQUANTIZEROW_SSE2
 #define HAS_ARGBSEPIAROW_SSSE3
 #define HAS_ARGBSHADE_SSE2
@@ -93,9 +94,9 @@ extern "C" {
 
 // The following are Windows only:
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_I422TORGBAROW_SSSE3
-#define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_RGBATOARGBROW_SSSE3
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
@@ -105,36 +106,42 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_MIRRORROW_SSE2
 #define HAS_ARGBATTENUATE_SSE2
 #define HAS_ARGBBLENDROW_SSE2
+#define HAS_MIRRORROW_SSE2
 #endif
 
 // The following are available on Neon platforms
 #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_MIRRORROW_NEON
-#define HAS_MIRRORROWUV_NEON
-#define HAS_SPLITUV_NEON
 #define HAS_COPYROW_NEON
+#define HAS_I422TOABGRROW_NEON
 #define HAS_I422TOARGBROW_NEON
 #define HAS_I422TOBGRAROW_NEON
-#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB24ROW_NEON
 #define HAS_I422TORGBAROW_NEON
-#define HAS_YUY2TOUV422ROW_NEON
-#define HAS_YUY2TOUVROW_NEON
-#define HAS_YUY2TOYROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORROWUV_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITUV_NEON
 #define HAS_UYVYTOUV422ROW_NEON
 #define HAS_UYVYTOUVROW_NEON
 #define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
 
 // TODO(fbarchard): Hook these up to calling functions.
-#define HAS_ARGBTORGBAROW_NEON
-#define HAS_ARGBTORGB24ROW_NEON
-#define HAS_ARGBTORAWROW_NEON
 #define HAS_ABGRTOARGBROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGBAROW_NEON
 #define HAS_BGRATOARGBROW_NEON
-#define HAS_RGBATOARGBROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV21TOARGBROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 #define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGBATOARGBROW_NEON
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER)
@@ -189,6 +196,24 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width);
+void I422ToRGB24Row_NEON(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width);
+void I422ToRAWRow_NEON(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int width);
+void NV12ToARGBRow_NEON(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width);
+void NV21ToARGBRow_NEON(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width);
 
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -237,6 +262,15 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
 void CopyRow_C(const uint8* src, uint8* dst, int count);
 
+void SetRow8_X86(uint8* dst, uint32 v32, int count);
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+                   int dst_stride, int height);
+void SetRow8_NEON(uint8* dst, uint32 v32, int count);
+void SetRows32_NEON(uint8* dst, uint32 v32, int width,
+                    int dst_stride, int height);
+void SetRow8_C(uint8* dst, uint32 v32, int count);
+void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height);
+
 void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
 void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
@@ -341,6 +375,16 @@ void I422ToRGBARow_C(const uint8* y_buf,
                      const uint8* v_buf,
                      uint8* rgba_buf,
                      int width);
+void I422ToRGB24Row_C(const uint8* y_buf,
+                      const uint8* u_buf,
+                      const uint8* v_buf,
+                      uint8* rgb24_buf,
+                      int width);
+void I422ToRAWRow_C(const uint8* y_buf,
+                    const uint8* u_buf,
+                    const uint8* v_buf,
+                    uint8* raw_buf,
+                    int width);
 
 void YToARGBRow_C(const uint8* y_buf,
                   uint8* rgb_buf,
@@ -517,30 +561,44 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
 void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
 void I422ToARGBRow_Any_NEON(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
 void I422ToBGRARow_Any_NEON(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
 void I422ToABGRRow_Any_NEON(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
 void I422ToRGBARow_Any_NEON(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
+void I422ToRGB24Row_Any_NEON(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width);
+void I422ToRAWRow_Any_NEON(const uint8* y_buf,
+                           const uint8* u_buf,
+                           const uint8* v_buf,
+                           uint8* rgb_buf,
+                           int width);
+void NV12ToARGBRow_Any_NEON(const uint8* y_buf,
+                            const uint8* uv_buf,
+                            uint8* argb_buf,
+                            int width);
+void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
+                            const uint8* uv_buf,
+                            uint8* argb_buf,
+                            int width);
 
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
@@ -671,4 +729,3 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 
 #endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT


@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 395
+#define LIBYUV_VERSION 396
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT


@@ -64,6 +64,7 @@
       # sources.
       'source/compare.cc',
+      'source/compare_neon.cc',
       'source/convert.cc',
      'source/convert_argb.cc',
       'source/convert_from.cc',
@@ -79,6 +80,7 @@
       'source/row_posix.cc',
       'source/row_win.cc',
       'source/scale.cc',
+      'source/scale_neon.cc',
       'source/scale_argb.cc',
       'source/video_common.cc',
     ],


@@ -244,44 +244,10 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
   return seed;
 }
 
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SUMSQUAREERROR_NEON
-static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
-                                  int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "vmov.u8    q7, #0                         \n"
-    "vmov.u8    q9, #0                         \n"
-    "vmov.u8    q8, #0                         \n"
-    "vmov.u8    q10, #0                        \n"
-  "1:                                          \n"
-    "vld1.u8    {q0}, [%0]!                    \n"
-    "vld1.u8    {q1}, [%1]!                    \n"
-    "vsubl.u8   q2, d0, d2                     \n"
-    "vsubl.u8   q3, d1, d3                     \n"
-    "vmlal.s16  q7, d4, d4                     \n"
-    "vmlal.s16  q8, d6, d6                     \n"
-    "vmlal.s16  q8, d5, d5                     \n"
-    "vmlal.s16  q10, d7, d7                    \n"
-    "subs       %2, %2, #16                    \n"
-    "bgt        1b                             \n"
-    "vadd.u32   q7, q7, q8                     \n"
-    "vadd.u32   q9, q9, q10                    \n"
-    "vadd.u32   q10, q7, q9                    \n"
-    "vpaddl.u32 q1, q10                        \n"
-    "vadd.u64   d0, d2, d3                     \n"
-    "vmov.32    %3, d0[0]                      \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
-  return sse;
-}
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
 
 #elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_SUMSQUAREERROR_SSE2

source/compare_neon.cc — new file (62 lines)

@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "vmov.u8    q8, #0                         \n"
+    "vmov.u8    q10, #0                        \n"
+    "vmov.u8    q9, #0                         \n"
+    "vmov.u8    q11, #0                        \n"
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.u8    {q0}, [%0]!                    \n"
+    "vld1.u8    {q1}, [%1]!                    \n"
+    "subs       %2, %2, #16                    \n"
+    "vsubl.u8   q2, d0, d2                     \n"
+    "vsubl.u8   q3, d1, d3                     \n"
+    "vmlal.s16  q8, d4, d4                     \n"
+    "vmlal.s16  q9, d6, d6                     \n"
+    "vmlal.s16  q10, d5, d5                    \n"
+    "vmlal.s16  q11, d7, d7                    \n"
+    "bgt        1b                             \n"
+    "vadd.u32   q8, q8, q9                     \n"
+    "vadd.u32   q10, q10, q11                  \n"
+    "vadd.u32   q11, q8, q10                   \n"
+    "vpaddl.u32 q1, q11                        \n"
+    "vadd.u64   d0, d2, d3                     \n"
+    "vmov.32    %3, d0[0]                      \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+  return sse;
+}
+
+#endif  // __ARM_NEON__
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
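Note: SumSquareError is the building block for libyuv's quality metrics. As a reference-only sketch (not part of this commit) of how a sum-of-squared-errors is typically turned into MSE and PSNR — using the public ComputeSumSquareError, which returns a uint64 from libyuv/basic_types.h:

  #include <math.h>
  #include "libyuv/compare.h"

  // Mean squared error over 'count' bytes.
  double MseFromSse(uint64 sse, uint64 count) {
    return static_cast<double>(sse) / static_cast<double>(count);
  }

  // PSNR in dB for 8-bit samples (peak 255). Identical buffers (sse == 0)
  // return a large sentinel instead of infinity.
  double PsnrFromSse(uint64 sse, uint64 count) {
    if (sse == 0) {
      return 128.0;  // sentinel for "infinite" PSNR
    }
    return 10.0 * log10(255.0 * 255.0 / MseFromSse(sse, count));
  }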


@@ -62,6 +62,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// Move to row_win etc.
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_HALFROW_SSE2
 __declspec(naked) __declspec(align(16))
@@ -188,7 +189,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
 
 // Blends 32x2 pixels to 16x1
 // source in scale.cc
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SCALEROWDOWN2_NEON
 void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width);


@@ -556,6 +556,14 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_NEON;
+    }
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
     NV12ToARGBRow(src_y, src_uv, dst_argb, width);
@@ -571,10 +579,10 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
 // Convert NV21 to ARGB.
 LIBYUV_API
 int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
+               const uint8* src_uv, int src_stride_uv,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
-  if (!src_y || !src_vu || !dst_argb ||
+  if (!src_y || !src_uv || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
   }
@@ -585,7 +593,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
     dst_stride_argb = -dst_stride_argb;
   }
   void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* vu_buf,
+                        const uint8* uv_buf,
                         uint8* rgb_buf,
                         int width) = NV21ToARGBRow_C;
 #if defined(HAS_NV21TOARGBROW_SSSE3)
@@ -599,13 +607,21 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_NEON;
+    }
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
-    NV21ToARGBRow(src_y, src_vu, dst_argb, width);
+    NV21ToARGBRow(src_y, src_uv, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
-      src_vu += src_stride_vu;
+      src_uv += src_stride_uv;
     }
   }
   return 0;


@@ -895,68 +895,50 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
 }
 
 // Convert I420 to RGB24.
+// TODO(fbarchard): One step I420ToRGB24Row_NEON.
 LIBYUV_API
 int I420ToRGB24(const uint8* src_y, int src_stride_y,
                 const uint8* src_u, int src_stride_u,
                 const uint8* src_v, int src_stride_v,
-                uint8* dst_argb, int dst_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
                 int width, int height) {
   if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
+      !dst_rgb24 ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_NEON;
-  }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_SSSE3;
-  }
-#endif
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB24Row_C;
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
-    }
-    if (IS_ALIGNED(width, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
-    }
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+  void (*I422ToRGB24Row)(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width) = I422ToRGB24Row_C;
+#if defined(HAS_I422TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_NEON;
+    }
+  }
+#elif defined(HAS_I422TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
+        I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+      }
     }
   }
 #endif
 
   for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, row, width);
-    ARGBToRGB24Row(row, dst_argb, width);
-    dst_argb += dst_stride_argb;
+    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
+    dst_rgb24 += dst_stride_rgb24;
     src_y += src_stride_y;
     if (y & 1) {
       src_u += src_stride_u;
@@ -967,67 +949,50 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
 
 // Convert I420 to RAW.
+// TODO(fbarchard): One step I420ToRAWRow_NEON.
 LIBYUV_API
 int I420ToRAW(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
-              uint8* dst_argb, int dst_stride_argb,
+              uint8* dst_raw, int dst_stride_raw,
               int width, int height) {
   if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
+      !dst_raw ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+    dst_stride_raw = -dst_stride_raw;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_NEON;
-  }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_SSSE3;
-  }
-#endif
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRAWRow_C;
-#if defined(HAS_ARGBTORAWROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
-    }
-    if (IS_ALIGNED(width, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBToRAWRow = ARGBToRAWRow_SSSE3;
-    }
-  }
-#elif defined(HAS_ARGBTORAWROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
-    }
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRAWRow = ARGBToRAWRow_NEON;
+  void (*I422ToRAWRow)(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int width) = I422ToRAWRow_C;
+#if defined(HAS_I422TORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRAWRow = I422ToRAWRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRAWRow = I422ToRAWRow_NEON;
+    }
+  }
+#elif defined(HAS_I422TORAWROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
+        I422ToRAWRow = I422ToRAWRow_SSSE3;
+      }
     }
   }
 #endif
 
   for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, row, width);
-    ARGBToRAWRow(row, dst_argb, width);
-    dst_argb += dst_stride_argb;
+    I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+    dst_raw += dst_stride_raw;
     src_y += src_stride_y;
     if (y & 1) {
      src_u += src_stride_u;
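Note: in libyuv, RAW is simply RGB24 with the R and B bytes swapped — the new I422ToRAWRow_C above writes (r, g, b) where the RGB24 row writes (b, g, r), and the NEON row later in this commit differs only by a vswp of d20/d22. A one-function sketch of that relationship (illustration only, not code from this commit):

  // Convert one row of RGB24 (B,G,R byte order) to RAW (R,G,B byte order)
  // by swapping the first and third byte of each 3-byte pixel.
  void RGB24ToRAWRow_Sketch(const uint8* src, uint8* dst, int width) {
    for (int x = 0; x < width; ++x) {
      dst[0] = src[2];
      dst[1] = src[1];
      dst[2] = src[0];
      src += 3;
      dst += 3;
    }
  }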


@@ -29,7 +29,7 @@
 // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
 #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
 static __inline void __cpuid(int cpu_info[4], int info_type) {
-  asm volatile (
+  asm volatile (  // NOLINT
    "mov %%ebx, %%edi                          \n"
    "cpuid                                     \n"
    "xchg %%edi, %%ebx                         \n"
@@ -38,7 +38,7 @@ static __inline void __cpuid(int cpu_info[4], int info_type) {
 }
 #elif defined(__i386__) || defined(__x86_64__)
 static __inline void __cpuid(int cpu_info[4], int info_type) {
-  asm volatile (
+  asm volatile (  // NOLINT
    "cpuid                                     \n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
@@ -85,7 +85,7 @@ static uint32 XGetBV(unsigned int xcr) {
 #define HAS_XGETBV
 static uint32 XGetBV(unsigned int xcr) {
   uint32 xcr_feature_mask;
-  asm volatile (
+  asm volatile (  // NOLINT
    ".byte 0x0f, 0x01, 0xd0\n"
    : "=a"(xcr_feature_mask)
    : "c"(xcr)
@@ -124,6 +124,18 @@ int ArmCpuCaps(const char* cpuinfo_name) {
 LIBYUV_API
 int cpu_info_ = 0;
 
+// Test environment variable for disabling CPU features. Any non-zero value
+// to disable. Zero ignored to make it easy to set the variable on/off.
+static bool TestEnv(const char* name) {
+  const char* var = getenv(name);
+  if (var) {
+    if (var[0] != '0') {
+      return true;
+    }
+  }
+  return false;
+}
+
 LIBYUV_API
 int InitCpuFlags(void) {
 #if !defined(__CLR_VER) && defined(CPU_X86)
@@ -144,34 +156,33 @@ int InitCpuFlags(void) {
     }
   }
 #endif
   // environment variable overrides for testing.
-  if (getenv("LIBYUV_DISABLE_X86")) {
+  if (TestEnv("LIBYUV_DISABLE_X86")) {
     cpu_info_ &= ~kCpuHasX86;
   }
-  if (getenv("LIBYUV_DISABLE_SSE2")) {
+  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
     cpu_info_ &= ~kCpuHasSSE2;
   }
-  if (getenv("LIBYUV_DISABLE_SSSE3")) {
+  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
     cpu_info_ &= ~kCpuHasSSSE3;
   }
-  if (getenv("LIBYUV_DISABLE_SSE41")) {
+  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
     cpu_info_ &= ~kCpuHasSSE41;
   }
-  if (getenv("LIBYUV_DISABLE_SSE42")) {
+  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
     cpu_info_ &= ~kCpuHasSSE42;
   }
-  if (getenv("LIBYUV_DISABLE_AVX")) {
+  if (TestEnv("LIBYUV_DISABLE_AVX")) {
     cpu_info_ &= ~kCpuHasAVX;
   }
-  if (getenv("LIBYUV_DISABLE_AVX2")) {
+  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
     cpu_info_ &= ~kCpuHasAVX2;
   }
-  if (getenv("LIBYUV_DISABLE_ASM")) {
+  if (TestEnv("LIBYUV_DISABLE_ASM")) {
     cpu_info_ = kCpuInitialized;
   }
 #elif defined(__arm__)
-#if defined(__linux__) && defined(__ARM_NEON__)
+#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
   // linux arm parse text file for neon detect.
   cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
 #elif defined(__ARM_NEON__)
@@ -181,10 +192,10 @@ int InitCpuFlags(void) {
   cpu_info_ = kCpuHasNEON;
 #endif
   cpu_info_ |= kCpuInitialized | kCpuHasARM;
-  if (getenv("LIBYUV_DISABLE_NEON")) {
+  if (TestEnv("LIBYUV_DISABLE_NEON")) {
     cpu_info_ &= ~kCpuHasNEON;
   }
-  if (getenv("LIBYUV_DISABLE_ASM")) {
+  if (TestEnv("LIBYUV_DISABLE_ASM")) {
     cpu_info_ = kCpuInitialized;
   }
 #endif  // __arm__
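With TestEnv, a feature is disabled only when the variable's first character is not '0', so LIBYUV_DISABLE_NEON=0 now leaves NEON enabled (previously any set value, including "0", disabled it). A small usage sketch — hypothetical test code, not part of this commit:

  #include <stdlib.h>
  #include "libyuv/cpu_id.h"

  // Force the NEON path off before libyuv reads its CPU flags,
  // e.g. for an A/B benchmark of the C fallback.
  int main() {
    setenv("LIBYUV_DISABLE_NEON", "1", 1);  // any value not starting with '0'
    libyuv::InitCpuFlags();  // applies the environment overrides shown above
    // ... run benchmark; TestCpuFlag(kCpuHasNEON) now reports false ...
    return 0;
  }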


@@ -10,6 +10,7 @@
 
 #include "libyuv/mjpeg_decoder.h"
 
+#ifdef HAVE_JPEG
 // Must be included before jpeglib
 #include <assert.h>
 #ifndef __CLR_VER
@@ -578,3 +579,5 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
 }
 
 }  // namespace libyuv
+
+#endif  // HAVE_JPEG


@@ -105,6 +105,130 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
   }
 }
 
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
+  void (*YUY2ToYRow)(const uint8* src_yuy2,
+                     uint8* dst_y, int pix);
+  YUY2ToYRow = YUY2ToYRow_C;
+  YUY2ToUV422Row = YUY2ToUV422Row_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (width > 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          YUY2ToYRow = YUY2ToYRow_SSE2;
+        }
+      }
+    }
+  }
+#elif defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      YUY2ToYRow = YUY2ToYRow_Any_NEON;
+      if (width > 16) {
+        YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+      }
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  void (*UYVYToUV422Row)(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+                     uint8* dst_y, int pix);
+  UYVYToYRow = UYVYToYRow_C;
+  UYVYToUV422Row = UYVYToUV422Row_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (width > 16) {
+      UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+      UYVYToYRow = UYVYToYRow_Any_SSE2;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
+      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+        UYVYToUV422Row = UYVYToUV422Row_SSE2;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          UYVYToYRow = UYVYToYRow_SSE2;
+        }
+      }
+    }
+  }
+#elif defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      UYVYToYRow = UYVYToYRow_Any_NEON;
+      if (width > 16) {
+        UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+      }
+    }
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      UYVYToUV422Row = UYVYToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    src_uyvy += src_stride_uyvy;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
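A minimal sketch of calling the new YUY2ToI422 entry point; buffer layout is the caller's responsibility and the sizes below are the usual I422 assumptions (full-height U/V planes at half width), not something this diff specifies:

  #include "libyuv/planar_functions.h"

  // Unpack a width x height YUY2 frame into I422 planes.
  // Assumes the caller allocated y[width*height] and u, v[(width/2)*height].
  int UnpackYuy2(const uint8* yuy2, int width, int height,
                 uint8* y, uint8* u, uint8* v) {
    return libyuv::YUY2ToI422(yuy2, width * 2,  // 2 bytes per packed pixel
                              y, width,
                              u, width / 2,
                              v, width / 2,
                              width, height);
  }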
 // Mirror I420 with optional flipping
 LIBYUV_API
 int I420Mirror(const uint8* src_y, int src_stride_y,
@@ -721,6 +845,11 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
     NV12ToARGBRow = NV12ToARGBRow_SSSE3;
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
+    NV12ToARGBRow = NV12ToARGBRow_NEON;
+  }
+#endif
 
   SIMD_ALIGNED(uint8 row[kMaxStride]);
   void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
@@ -789,129 +918,6 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// SetRow8 writes 'count' bytes using a 32 bit value repeated
-// SetRow32 writes 'count' words using a 32 bit value repeated
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
-#define HAS_SETROW_NEON
-static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (  // NOLINT
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
-    "1:                                        \n"
-    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
-    "vst1.u32  {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "q0", "memory", "cc");
-}
-
-// TODO(fbarchard): Make fully assembler
-static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
-                           int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    SetRow8_NEON(dst, v32, width << 2);
-    dst += dst_stride;
-  }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_SETROW_X86
-__declspec(naked) __declspec(align(16))
-static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
-  __asm {
-    mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v32
-    mov        ecx, [esp + 12]  // count
-    shr        ecx, 2
-    rep stosd
-    mov        edi, edx
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-static void SetRows32_X86(uint8* dst, uint32 v32, int width,
-                          int dst_stride, int height) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebp
-    mov        edi, [esp + 12 + 4]   // dst
-    mov        eax, [esp + 12 + 8]   // v32
-    mov        ebp, [esp + 12 + 12]  // width
-    mov        edx, [esp + 12 + 16]  // dst_stride
-    mov        esi, [esp + 12 + 20]  // height
-    lea        ecx, [ebp * 4]
-    sub        edx, ecx              // stride - width * 4
-
-    align      16
-  convertloop:
-    mov        ecx, ebp
-    rep stosd
-    add        edi, edx
-    sub        esi, 1
-    jg         convertloop
-
-    pop        ebp
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_SETROW_X86
-static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
-  size_t width_tmp = static_cast<size_t>(width);
-  asm volatile (  // NOLINT
-    "shr       $0x2,%1                         \n"
-    "rep stosl                                 \n"
-  : "+D"(dst),       // %0
-    "+c"(width_tmp)  // %1
-  : "a"(v32)         // %2
-  : "memory", "cc");
-}
-
-static void SetRows32_X86(uint8* dst, uint32 v32, int width,
-                          int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    size_t width_tmp = static_cast<size_t>(width);
-    uint32* d = reinterpret_cast<uint32*>(dst);
-    asm volatile (  // NOLINT
-      "rep stosl                               \n"
-    : "+D"(d),         // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
-    dst += dst_stride;
-  }
-}
-#endif
-
-static void SetRow8_C(uint8* dst, uint32 v8, int count) {
-#ifdef _MSC_VER
-  for (int x = 0; x < count; ++x) {
-    dst[x] = v8;
-  }
-#else
-  memset(dst, v8, count);
-#endif
-}
-
-static void SetRows32_C(uint8* dst, uint32 v32, int width,
-                        int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    uint32* d = reinterpret_cast<uint32*>(dst);
-    for (int x = 0; x < width; ++x) {
-      d[x] = v32;
-    }
-    dst += dst_stride;
-  }
-}
-
 LIBYUV_API
 void SetPlane(uint8* dst_y, int dst_stride_y,
               int width, int height,
@@ -929,13 +935,6 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
     SetRow = SetRow8_X86;
   }
 #endif
-#if defined(HAS_SETROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    SetRow = SetRow8_SSE2;
-  }
-#endif
 
   uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
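For context, SetPlane (whose public signature appears in planar_functions.h above) fills a plane with one byte value via these Set rows. A small usage sketch — the 0x10/0x80 values are the conventional video-range black, chosen here as an example, not taken from this commit:

  #include "libyuv/planar_functions.h"

  // Clear an I420 frame to black: Y = 0x10, U = V = 0x80.
  void ClearToBlack(uint8* y, int y_stride, uint8* u, int u_stride,
                    uint8* v, int v_stride, int width, int height) {
    libyuv::SetPlane(y, y_stride, width, height, 0x10);
    libyuv::SetPlane(u, u_stride, width / 2, height / 2, 0x80);
    libyuv::SetPlane(v, v_stride, width / 2, height / 2, 0x80);
  }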
// Set plane // Set plane


@@ -459,6 +459,14 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
                               (255u << ashift);
 }
 
+static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
+                               uint8* b, uint8* g, uint8* r) {
+  int32 y1 = (static_cast<int32>(y) - 16) * YG;
+  *b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
+  *g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
+  *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
+}
+
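YuvPixel2 is the scalar core of the two new RGB24/RAW rows: 6-bit fixed point, so each coefficient is a BT.601 value pre-scaled by 64 and the final >> 6 divides that back out (the YG/UB/.../BB constants are defined earlier in row_common.cc and are not shown in this hunk). A float sketch of the same math, for reference only and not part of the commit:

  // Reference-only sketch of the conversion YuvPixel2 implements in
  // fixed point, using standard BT.601 video-range coefficients.
  static inline uint8 ClampToByte(float v) {
    return v < 0.f ? 0 : (v > 255.f ? 255 : static_cast<uint8>(v + 0.5f));
  }

  static void YuvPixelFloat(uint8 y, uint8 u, uint8 v,
                            uint8* b, uint8* g, uint8* r) {
    float y1 = 1.164f * (y - 16);  // matches (y - 16) * YG with YG ~= 74/64
    *b = ClampToByte(y1 + 2.018f * (u - 128));
    *g = ClampToByte(y1 - 0.391f * (u - 128) - 0.813f * (v - 128));
    *r = ClampToByte(y1 + 1.596f * (v - 128));
  }

The 128 subtractions correspond to the BB/BG/BR bias terms in the fixed-point version.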
 void I444ToARGBRow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
@@ -492,6 +500,48 @@ void I422ToARGBRow_C(const uint8* y_buf,
   }
 }
 
+void I422ToRGB24Row_C(const uint8* y_buf,
+                      const uint8* u_buf,
+                      const uint8* v_buf,
+                      uint8* rgb_buf,
+                      int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+              rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
+    y_buf += 2;
+    u_buf += 1;
+    v_buf += 1;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+  }
+}
+
+void I422ToRAWRow_C(const uint8* y_buf,
+                    const uint8* u_buf,
+                    const uint8* v_buf,
+                    uint8* rgb_buf,
+                    int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+              rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
+    y_buf += 2;
+    u_buf += 1;
+    v_buf += 1;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+  }
+}
+
 void I411ToARGBRow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
@@ -671,6 +721,28 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
 
+void SetRow8_C(uint8* dst, uint32 v8, int count) {
+#ifdef _MSC_VER
+  // VC will generate rep stosb.
+  for (int x = 0; x < count; ++x) {
+    dst[x] = v8;
+  }
+#else
+  memset(dst, v8, count);
+#endif
+}
+
+void SetRows32_C(uint8* dst, uint32 v32, int width,
+                 int dst_stride, int height) {
+  for (int y = 0; y < height; ++y) {
+    uint32* d = reinterpret_cast<uint32*>(dst);
+    for (int x = 0; x < width; ++x) {
+      d[x] = v32;
+    }
+    dst += dst_stride;
+  }
+}
+
 // Filter 2 rows of YUY2 UV's (422) into U and V (420).
 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
                    uint8* dst_u, uint8* dst_v, int width) {
@@ -950,6 +1022,11 @@ Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
 YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
 YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
 #endif
+#ifdef HAS_I422TORGB24ROW_SSSE3
+YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \
+     I422ToRGB24Row_C, 1)
+YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
+#endif
 #ifdef HAS_I422TORGBAROW_SSSE3
 YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
 #endif
@@ -958,6 +1035,10 @@ YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
 YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
 YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
 YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
+Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0)
+Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0)
+YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1)
+YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1)
 #endif
 #undef YANY


@@ -101,8 +101,8 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
       "+r"(width)     // %4
     : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
-      "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_I422TOARGBROW_NEON
@@ -135,8 +135,8 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
       "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_I422TOBGRAROW_NEON
@@ -169,8 +169,8 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
       "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_I422TOABGRROW_NEON
@@ -202,12 +202,77 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
       "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_I422TORGBAROW_NEON
 
+#ifdef HAS_I422TORGB24ROW_NEON
+void I422ToRGB24Row_NEON(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width) {
+  asm volatile (
+    "vld1.u8    {d24}, [%5]                    \n"
+    "vld1.u8    {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"
+    "bgt        1b                             \n"
+    : "+r"(y_buf),    // %0
+      "+r"(u_buf),    // %1
+      "+r"(v_buf),    // %2
+      "+r"(rgb_buf),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TORGB24ROW_NEON
+
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int width) {
+  asm volatile (
+    "vld1.u8    {d24}, [%5]                    \n"
+    "vld1.u8    {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vswp.u8    d20, d22                       \n"
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"
+    "bgt        1b                             \n"
+    : "+r"(y_buf),    // %0
+      "+r"(u_buf),    // %1
+      "+r"(v_buf),    // %2
+      "+r"(rgb_buf),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TORAWROW_NEON
+
 #ifdef HAS_NV12TOARGBROW_NEON
 void NV12ToARGBRow_NEON(const uint8* y_buf,
                         const uint8* uv_buf,
@@ -233,8 +298,8 @@ void NV12ToARGBRow_NEON(const uint8* y_buf,
       "+r"(width)     // %3
    : "r"(&kUVToRB),  // %4
      "r"(&kUVToG)    // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
-      "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_NV12TOARGBROW_NEON
@@ -264,8 +329,8 @@ void NV21ToARGBRow_NEON(const uint8* y_buf,
       "+r"(width)     // %3
    : "r"(&kUVToRB),  // %4
      "r"(&kUVToG)    // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
-      "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_NV21TOARGBROW_NEON
@@ -312,7 +377,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
 #endif  // HAS_COPYROW_NEON
 
 #ifdef HAS_SETROW_NEON
-// SetRow8 writes 'count' bytes using a 32 bit value repeated
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
 void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
   asm volatile (  // NOLINT
    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
@@ -327,7 +392,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
 }
 
 // TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated
+// SetRow32 writes 'count' words using a 32 bit value repeated.
 void SetRows32_NEON(uint8* dst, uint32 v32, int width,
                     int dst_stride, int height) {
   for (int y = 0; y < height; ++y) {
@@ -358,7 +423,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
    "sub        %1, #16                        \n"
     // the loop needs to run on blocks of 16. what will be left
     // over is either a negative number, the residuals that need
-    // to be done, or 0. if this isn't subtracted off here the
+    // to be done, or 0. If this isn't subtracted off here the
     // loop will run one extra time.
    "sub        %2, #16                        \n"


@@ -2143,6 +2143,34 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_COPYROW_X86
 
+#ifdef HAS_SETROW_X86
+void SetRow8_X86(uint8* dst, uint32 v32, int width) {
+  size_t width_tmp = static_cast<size_t>(width);
+  asm volatile (
+    "shr       $0x2,%1                         \n"
+    "rep stosl                                 \n"
+  : "+D"(dst),       // %0
+    "+c"(width_tmp)  // %1
+  : "a"(v32)         // %2
+  : "memory", "cc");
+}
+
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+                   int dst_stride, int height) {
+  for (int y = 0; y < height; ++y) {
+    size_t width_tmp = static_cast<size_t>(width);
+    uint32* d = reinterpret_cast<uint32*>(dst);
+    asm volatile (
+      "rep stosl                               \n"
+    : "+D"(d),         // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
+    dst += dst_stride;
+  }
+}
+#endif  // HAS_SETROW_X86
+
 #ifdef HAS_YUY2TOYROW_SSE2
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
   asm volatile (


@@ -18,6 +18,7 @@ extern "C" {
 
 // This module is for Visual C x86.
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+// TODO(fbarchard): I420ToRGB24, I420ToRAW
 
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constants for ARGB.
@@ -2521,6 +2522,54 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
 }
 #endif  // HAS_COPYROW_X86
 
+#ifdef HAS_SETROW_X86
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRow8_X86(uint8* dst, uint32 v32, int count) {
+  __asm {
+    mov        edx, edi
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v32
+    mov        ecx, [esp + 12]  // count
+    shr        ecx, 2
+    rep stosd
+    mov        edi, edx
+    ret
+  }
+}
+
+// SetRow32 writes 'count' words using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+                   int dst_stride, int height) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebp
+    mov        edi, [esp + 12 + 4]   // dst
+    mov        eax, [esp + 12 + 8]   // v32
+    mov        ebp, [esp + 12 + 12]  // width
+    mov        edx, [esp + 12 + 16]  // dst_stride
+    mov        esi, [esp + 12 + 20]  // height
+    lea        ecx, [ebp * 4]
+    sub        edx, ecx              // stride - width * 4
+
+    align      16
+  convertloop:
+    mov        ecx, ebp
+    rep stosd
+    add        edi, edx
+    sub        esi, 1
+    jg         convertloop
+
+    pop        ebp
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SETROW_X86
+
 #ifdef HAS_YUY2TOYROW_SSE2
 __declspec(naked) __declspec(align(16))
 void YUY2ToYRow_SSE2(const uint8* src_yuy2,


@ -54,514 +54,49 @@ void SetUseReferenceImpl(bool use) {
#define HAS_SCALEROWDOWN2_NEON #define HAS_SCALEROWDOWN2_NEON
// Note - not static due to reuse in convert for 444 to 420. // Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) { uint8* dst, int dst_width);
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width);
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
// row 2 add adjacent, add row 1 to row 2
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
#define HAS_SCALEROWDOWN4_NEON #define HAS_SCALEROWDOWN4_NEON
static void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile ( void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n" uint8* dst_ptr, int dst_width);
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}
static void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
#define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels. Use the neon multilane read/write // Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers. // to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels. // Point samples 32 pixels to 24 pixels.
static void ScaleRowDown34_NEON(const uint8* src_ptr, void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile ( void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile ( void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
#define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
// 32 -> 12 // 32 -> 12
static void ScaleRowDown38_NEON(const uint8* src_ptr, void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
// 32x3 -> 12x1 // 32x3 -> 12x1
static void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
// 32x2 -> 12x1
static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
// 16x2 -> 16x1
#define HAS_SCALEFILTERROWS_NEON
static void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
/**
* SSE2 downscalers with interpolation.

source/scale_neon.cc (new file)
@@ -0,0 +1,534 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC Neon
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
/**
* NEON downscalers with interpolation.
*
* Provided by Fritz Koenig
*
*/
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
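// For reference, a scalar sketch of the kernel above: vld2 deinterleaves
// the row and vst1 stores q0, so the even pixel of each pair survives.
// Illustrative only; the _C_Sketch name is hypothetical and uint8_t
// (from <stdint.h>) stands in for libyuv's uint8.
static void ScaleRowDown2_C_Sketch(const uint8_t* src_ptr,
                                   uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];  // keep the even pixel of each pair
  }
}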
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
"vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
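// Scalar sketch of the 2x2 box filter above (hypothetical name; assumes
// <stdint.h> and <stddef.h>): vpaddl/vpadal sum each 2x2 block and
// vrshrn #2 divides by 4 with rounding.
static void ScaleRowDown2Int_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst, int dst_width) {
  const uint8_t* s = src_ptr;               // row 1
  const uint8_t* t = src_ptr + src_stride;  // row 2
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}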
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}
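// Tracing the lanes above (vld2, vtrn.u8, vshrn #8, then the 32-bit
// store of lane 1), the bytes that survive each 16-byte load work out
// to src[0], src[4], src[8], src[12]. A scalar sketch of that
// selection (hypothetical name, <stdint.h> types):
static void ScaleRowDown4_C_Sketch(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4];  // point sample every 4th pixel
  }
}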
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
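// Scalar sketch of the 4x4 box filter above (hypothetical name): each
// output pixel is the rounded average of a 4x4 block, matching the
// vpaddl/vpadal accumulation and the vrshrn #4 divide by 16.
static void ScaleRowDown4Int_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int r = 0; r < 4; ++r) {
      const uint8_t* s = src_ptr + r * src_stride + x * 4;
      sum += s[0] + s[1] + s[2] + s[3];
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // /16 with rounding
  }
}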
// Downscale from 4 to 3 pixels. Use the NEON multi-lane read/write
// to load every 4th pixel into a different register.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
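// The vmov above overwrites lane d2 with d3, so of every 4 input pixels
// the kernel keeps pixels 0, 1 and 3. Scalar sketch (hypothetical name):
static void ScaleRowDown34_C_Sketch(const uint8_t* src_ptr,
                                    uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}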
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
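// Scalar sketch of the filtered 4-to-3 kernel above (hypothetical name):
// the two rows are first blended 3:1, then each group of 4 pixels is
// narrowed to 3 with the a0/a1/a2 weights from the comments. Rounding
// matches vqrshrn; saturation never triggers for 8-bit inputs.
static void ScaleRowDown34_0_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    int p0 = (3 * s[0] + t[0] + 2) >> 2;
    int p1 = (3 * s[1] + t[1] + 2) >> 2;
    int p2 = (3 * s[2] + t[2] + 2) >> 2;
    int p3 = (3 * s[3] + t[3] + 2) >> 2;
    dst_ptr[x + 0] = (uint8_t)((3 * p0 + p1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((p1 + p2 + 1) >> 1);
    dst_ptr[x + 2] = (uint8_t)((p2 + 3 * p3 + 2) >> 2);
    s += 4;
    t += 4;
  }
}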
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
#define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
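// These constants implement division in fixed point: vqrdmulh.s16
// computes roughly (2 * a * b + (1 << 15)) >> 16, so b = 65536 / 12
// yields a / 6 and b = 65536 / 18 yields a / 9. A scalar sketch of the
// same trick (hypothetical name):
static inline uint8_t FixedDiv6_Sketch(int sum) {
  // e.g. sum = 1530 (six pixels of 255) gives 255
  return (uint8_t)((2 * sum * (65536 / 12) + (1 << 15)) >> 16);
}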
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
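// The two vtbl lookups above are a pure point sample: of every 32 input
// pixels, the 12 at the offsets baked into kShuf38 survive. Scalar
// sketch (hypothetical name):
static void ScaleRowDown38_C_Sketch(const uint8_t* src_ptr,
                                    uint8_t* dst_ptr, int dst_width) {
  static const int kIdx[12] = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30};
  for (int x = 0; x < dst_width; x += 12) {
    for (int i = 0; i < 12; ++i) {
      dst_ptr[x + i] = src_ptr[kIdx[i]];
    }
    src_ptr += 32;
  }
}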
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
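// Scalar sketch of the 3-row kernel above (hypothetical name): each 8x3
// block yields 3 pixels, two averaging 3x3 = 9 samples and one averaging
// 2x3 = 6. Plain integer division here; the vqrdmulh path rounds, so
// results can differ by one.
static void ScaleRowDown38_3_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* r0 = src_ptr;
  const uint8_t* r1 = src_ptr + src_stride;
  const uint8_t* r2 = src_ptr + src_stride * 2;
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8_t)((r0[0] + r0[1] + r0[2] + r1[0] + r1[1] +
                                r1[2] + r2[0] + r2[1] + r2[2]) / 9);
    dst_ptr[x + 1] = (uint8_t)((r0[3] + r0[4] + r0[5] + r1[3] + r1[4] +
                                r1[5] + r2[3] + r2[4] + r2[5]) / 9);
    dst_ptr[x + 2] = (uint8_t)((r0[6] + r0[7] + r1[6] + r1[7] +
                                r2[6] + r2[7]) / 6);
    r0 += 8;
    r1 += 8;
    r2 += 8;
  }
}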
// 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
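// Scalar sketch of the 2-row kernel above (hypothetical name): each 8x2
// block yields 3 pixels, two averaging 3x2 = 6 samples (the fixed-point
// divide by 6) and one averaging 2x2 = 4 (vqrshrn #2, rounded).
static void ScaleRowDown38_2_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8_t)((s[0] + s[1] + s[2] +
                                t[0] + t[1] + t[2]) / 6);
    dst_ptr[x + 1] = (uint8_t)((s[3] + s[4] + s[5] +
                                t[3] + t[4] + t[5]) / 6);
    dst_ptr[x + 2] = (uint8_t)((s[6] + s[7] + t[6] + t[7] + 2) >> 2);
    s += 8;
    t += 8;
  }
}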
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
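// Scalar sketch of the row interpolator above (hypothetical name):
// blend two rows at source_y_fraction/256, with the special cases
// (fraction 0 is a copy, 128 a plain average) falling out of the
// general formula. The extra write past dst_width mirrors the final
// vst1.u8 {d1[7]} above.
static void ScaleFilterRows_C_Sketch(uint8_t* dst_ptr,
                                     const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     int dst_width, int source_y_fraction) {
  int y1 = source_y_fraction;  // weight of the second row, 0..256
  int y0 = 256 - y1;           // weight of the first row
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((s[x] * y0 + t[x] * y1 + 128) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // replicate last pixel
}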
#endif // __ARM_NEON__
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif