Move Neon source to its own files.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/860009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@396 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 4807dea4e7
commit 64ce0ab544
@@ -22,8 +22,10 @@ LOCAL_SRC_FILES := \
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_CFLAGS += -DLIBYUV_NEON
LOCAL_SRC_FILES += \
source/compare_neon.cc \
source/rotate_neon.cc.neon \
source/row_neon.cc.neon
source/row_neon.cc.neon \
source/scale_neon.cc
endif

LOCAL_C_INCLUDES += $(LOCAL_PATH)/include

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 395
Version: 396
License: BSD
License File: LICENSE

@@ -18,7 +18,7 @@ namespace libyuv {
extern "C" {
#endif

// Compute a hash for specified memory. Seed of 5381 recommended.
// Compute a hash for specified memory. Seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
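For reference, HashDjb2 above is the classic djb2 accumulator; a minimal C++ sketch of the function the optimized paths compute, assuming the uint8/uint32/uint64 typedefs from libyuv/basic_types.h:

// Sketch only: hash = hash * 33 + byte, starting from the recommended seed 5381.
uint32 HashDjb2_Reference(const uint8* src, uint64 count, uint32 seed) {
  uint32 hash = seed;
  for (uint64 i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];
  }
  return hash;
}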
@@ -73,7 +73,7 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);

// Convert NV12 to I420. Also used for NV21.
// Convert NV12 to I420. Also used for NV21.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,

@@ -229,7 +229,7 @@ int MJPGToI420(const uint8* sample, size_t sample_size,
// Must be less than or equal to src_width/src_height
// Cropping parameters are pre-rotation.
// "rotation" can be 0, 90, 180 or 270.
// "format" is a fourcc. ie 'I420', 'YUY2'
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToI420(const uint8* src_frame, size_t src_size,

@@ -19,7 +19,7 @@

// TODO(fbarchard): This set of functions should exactly match convert.h
// Add missing V210 and Q420.
// TODO(fbarchard): Add tests. Create random content of right size and convert
// TODO(fbarchard): Add tests. Create random content of right size and convert
// with C vs Opt and or to I420 and compare.
// TODO(fbarchard): Some of these functions lack parameter setting.

@@ -75,7 +75,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);

// Convert I400 to ARGB. Reverse of ARGBToI400.
// Convert I400 to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,

@@ -209,7 +209,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
// Must be less than or equal to src_width/src_height
// Cropping parameters are pre-rotation.
// "rotation" can be 0, 90, 180 or 270.
// "format" is a fourcc. ie 'I420', 'YUY2'
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToARGB(const uint8* src_frame, size_t src_size,

@@ -50,7 +50,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);

// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
LIBYUV_API
int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,

@@ -58,7 +58,7 @@ static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API
void MaskCpuFlags(int enable_flags);

// Low level cpuid for X86. Returns zeros on other CPUs.
// Low level cpuid for X86. Returns zeros on other CPUs.
LIBYUV_API
void CpuId(int cpu_info[4], int info_type);

@@ -27,13 +27,31 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value);

// Alias.
#define I400ToI400 CopyPlane

// Copy a plane of data (I420 to I400).
LIBYUV_API
void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);

// Convert I420 to I400. (calls CopyPlane ignoring u/v).
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);

// Convert UYVY to I422.
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);

// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,

@@ -196,7 +214,7 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);

// Quantize a rectangle of ARGB. Alpha unaffected.
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
// interval_offset should be a value between 0 and 255.

@@ -261,7 +279,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
int w, int h, int dw, int dh);

// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
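The comment above describes a summed-area table. A minimal single-channel sketch (not part of the patch; the real table stores four int32 lanes per ARGB pixel) of the recurrence it builds, from which any rectangle sum takes only four lookups:

// cumsum[y][x] = row prefix up to x, plus the cumulative sum of the row above.
void ComputeCumulativeSum1(const uint8* src, int w, int h, int32* cumsum) {
  for (int y = 0; y < h; ++y) {
    int32 rowsum = 0;
    for (int x = 0; x < w; ++x) {
      rowsum += src[y * w + x];
      cumsum[y * w + x] = rowsum + (y > 0 ? cumsum[(y - 1) * w + x] : 0);
    }
  }
}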
@@ -299,7 +317,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
#define YUV_DISABLE_ASM
#endif
// Row functions for copying a pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
@@ -66,6 +66,7 @@ extern "C" {
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
#define HAS_SETROW_X86
#define HAS_SPLITUV_SSE2
#define HAS_UYVYTOUV422ROW_SSE2
#define HAS_UYVYTOUVROW_SSE2

@@ -76,13 +77,13 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2

// Effects
#define HAS_ARGBMIRRORROW_SSSE3
#define HAS_ARGBAFFINEROW_SSE2
#define HAS_ARGBATTENUATEROW_SSSE3
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBCOLORMATRIXROW_SSSE3
#define HAS_ARGBGRAYROW_SSSE3
#define HAS_ARGBINTERPOLATEROW_SSSE3
#define HAS_ARGBMIRRORROW_SSSE3
#define HAS_ARGBQUANTIZEROW_SSE2
#define HAS_ARGBSEPIAROW_SSSE3
#define HAS_ARGBSHADE_SSE2

@@ -93,9 +94,9 @@ extern "C" {

// The following are Windows only:
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_ARGBCOLORTABLEROW_X86
#define HAS_I422TORGBAROW_SSSE3
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_RGBATOARGBROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3

@@ -105,36 +106,42 @@ extern "C" {
#if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_SSSE3_ONLY)
#define HAS_MIRRORROW_SSE2
#define HAS_ARGBATTENUATE_SSE2
#define HAS_ARGBBLENDROW_SSE2
#define HAS_MIRRORROW_SSE2
#endif

// The following are available on Neon platforms
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON
#define HAS_SPLITUV_NEON
#define HAS_COPYROW_NEON
#define HAS_I422TOABGRROW_NEON
#define HAS_I422TOARGBROW_NEON
#define HAS_I422TOBGRAROW_NEON
#define HAS_I422TOABGRROW_NEON
#define HAS_I422TORAWROW_NEON
#define HAS_I422TORGB24ROW_NEON
#define HAS_I422TORGBAROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITUV_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON

// TODO(fbarchard): Hook these up to calling functions.
#define HAS_ARGBTORGBAROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ABGRTOARGBROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGBAROW_NEON
#define HAS_BGRATOARGBROW_NEON
#define HAS_RGBATOARGBROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV21TOARGBROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGBATOARGBROW_NEON
#endif

#if defined(_MSC_VER) && !defined(__CLR_VER)
@@ -189,6 +196,24 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRGB24Row_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width);
void NV21ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width);

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);

@@ -237,6 +262,15 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);

void SetRow8_X86(uint8* dst, uint32 v32, int count);
void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height);
void SetRow8_NEON(uint8* dst, uint32 v32, int count);
void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height);
void SetRow8_C(uint8* dst, uint32 v32, int count);
void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height);

void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);

@@ -341,6 +375,16 @@ void I422ToRGBARow_C(const uint8* y_buf,
const uint8* v_buf,
uint8* rgba_buf,
int width);
void I422ToRGB24Row_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb24_buf,
int width);
void I422ToRAWRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* raw_buf,
int width);

void YToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,

@@ -517,30 +561,44 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);

void I422ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void I422ToBGRARow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void I422ToABGRRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void I422ToRGBARow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRGB24Row_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);

void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,

@@ -671,4 +729,3 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

#endif  // INCLUDE_LIBYUV_ROW_H_ NOLINT
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 395
#define LIBYUV_VERSION 396

#endif  // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -64,6 +64,7 @@

# sources.
'source/compare.cc',
'source/compare_neon.cc',
'source/convert.cc',
'source/convert_argb.cc',
'source/convert_from.cc',

@@ -79,6 +80,7 @@
'source/row_posix.cc',
'source/row_win.cc',
'source/scale.cc',
'source/scale_neon.cc',
'source/scale_argb.cc',
'source/video_common.cc',
],
@@ -244,44 +244,10 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
return seed;
}

#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SUMSQUAREERROR_NEON

static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
int count) {
volatile uint32 sse;
asm volatile (
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"

"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q7, d4, d4 \n"
"vmlal.s16 q8, d6, d6 \n"
"vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n"
"bgt 1b \n"

"vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n"
"vadd.u32 q10, q7, q9 \n"
"vpaddl.u32 q1, q10 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
return sse;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);

#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SUMSQUAREERROR_SSE2
source/compare_neon.cc (new file, 62 lines)
@@ -0,0 +1,62 @@
/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)

uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"

".p2align 2 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"

"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}

#endif  // __ARM_NEON__

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
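As a reference for the new file, a plain C++ sketch of what the NEON loop computes; like the vectorized loop, it assumes count is a multiple of 16:

uint32 SumSquareError_Reference(const uint8* src_a, const uint8* src_b,
                                int count) {
  uint32 sse = 0;
  for (int i = 0; i < count; ++i) {
    int diff = src_a[i] - src_b[i];  // widen, square, accumulate
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}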
@@ -62,6 +62,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0;
}

// Move to row_win etc.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_HALFROW_SSE2
__declspec(naked) __declspec(align(16))

@@ -188,7 +189,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,

// Blends 32x2 pixels to 16x1
// source in scale.cc
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);

@@ -393,7 +394,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
// M420 format description:
// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
// Chroma is half width / half height. (420)
// src_stride_m420 is row planar. Normally this will be the width in pixels.
// src_stride_m420 is row planar. Normally this will be the width in pixels.
// The UV plane is half width, but 2 values, so src_stride_m420 applies to
// this as well as the two Y planes.
static int X420ToI420(const uint8* src_y,
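A small sketch (not in the patch) of the row addressing the M420 comment implies, with every three strides holding two Y rows followed by one UV row:

const uint8* M420RowY(const uint8* m420, int stride, int y) {
  return m420 + (y / 2 * 3 + (y & 1)) * stride;  // Y rows 0,1 then 3,4 then ...
}
const uint8* M420RowUV(const uint8* m420, int stride, int y) {
  return m420 + (y / 2 * 3 + 2) * stride;        // UV rows 2, 5, 8, ...
}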
@@ -592,10 +593,10 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
// This policy assumes that the caller handles the last row of an odd height
// image using C.
// READSAFE_PAGE - enable read ahead within same page.
// A page is 4096 bytes. When reading ahead, if the last pixel is near the
// A page is 4096 bytes. When reading ahead, if the last pixel is near the
// end the page, and a read spans the page into the next page, a memory
// exception can occur if that page has not been allocated, or is a guard
// page. This setting ensures the overread is within the same page.
// page. This setting ensures the overread is within the same page.
// READSAFE_ALWAYS - enables read ahead on systems without memory exceptions
// or where buffers are padded by 64 bytes.
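An illustrative check for the READSAFE_PAGE policy described above, assuming 4096-byte pages (sketch only, not the library's actual guard):

#include <stddef.h>
#include <stdint.h>
static inline bool ReadSafeSamePage(const void* p, size_t overread) {
  uintptr_t first = reinterpret_cast<uintptr_t>(p);
  uintptr_t last = first + overread - 1;
  return (first & ~static_cast<uintptr_t>(4095)) ==
         (last & ~static_cast<uintptr_t>(4095));  // no page crossing
}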
@@ -790,7 +791,7 @@ static inline uint32 READWORD(const uint8* p) {
}
#endif

// Must be multiple of 6 pixels. Will over convert to handle remainder.
// Must be multiple of 6 pixels. Will over convert to handle remainder.
// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210
static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
for (int x = 0; x < width; x += 6) {

@@ -820,7 +821,7 @@ static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
}

// Convert V210 to I420.
// V210 is 10 bit version of UYVY. 16 bytes to store 6 pixels.
// V210 is 10 bit version of UYVY. 16 bytes to store 6 pixels.
// With is multiple of 48.
LIBYUV_API
int V210ToI420(const uint8* src_v210, int src_stride_v210,

@@ -1611,7 +1612,7 @@ static void JpegI400ToI420(void* opaque,
}

// MJPG (Motion JPeg) to I420
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
LIBYUV_API
int MJPGToI420(const uint8* sample,
size_t sample_size,

@@ -1689,7 +1690,7 @@ int MJPGToI420(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// factors that occur in practice. 411 is supported by libjpeg
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;

@@ -1734,7 +1735,7 @@ int ConvertToI420(const uint8* sample,
}
int r = 0;

// One pass rotation is available for some formats. For the rest, convert
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
// and then rotate the I420 to the final destination buffer.
// For in-place conversion, if destination y is same as source sample,

@@ -556,6 +556,14 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
}
}
#endif

for (int y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, width);

@@ -571,10 +579,10 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
// Convert NV21 to ARGB.
LIBYUV_API
int NV21ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
const uint8* src_uv, int src_stride_uv,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_y || !src_vu || !dst_argb ||
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}

@@ -585,7 +593,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
void (*NV21ToARGBRow)(const uint8* y_buf,
const uint8* vu_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width) = NV21ToARGBRow_C;
#if defined(HAS_NV21TOARGBROW_SSSE3)

@@ -599,13 +607,21 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_NV21TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_NEON;
}
}
#endif

for (int y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_vu, dst_argb, width);
NV21ToARGBRow(src_y, src_uv, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_vu += src_stride_vu;
src_uv += src_stride_uv;
}
}
return 0;
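The NEON blocks added above follow the dispatch pattern used throughout the library: start from the C row, upgrade to the width-tolerant Any variant when the CPU flag is set, then to the full-speed kernel when the width divides evenly. A sketch with hypothetical Row* names (TestCpuFlag, kCpuHasNEON and IS_ALIGNED are the real helpers):

RowFunc Row = Row_C;                   // always-correct fallback
#if defined(HAS_ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
  Row = Row_Any_NEON;                  // handles any width >= 8
  if (IS_ALIGNED(width, 8)) {
    Row = Row_NEON;                    // fastest: width a multiple of 8
  }
}
#endif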
@@ -890,7 +906,7 @@ static void JpegI400ToARGB(void* opaque,
}

// MJPG (Motion JPeg) to ARGB
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
LIBYUV_API
int MJPGToARGB(const uint8* sample,
size_t sample_size,

@@ -966,7 +982,7 @@ int MJPGToARGB(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// factors that occur in practice. 411 is supported by libjpeg
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;

@@ -1004,7 +1020,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
}
int r = 0;

// One pass rotation is available for some formats. For the rest, convert
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
// and then rotate the I420 to the final destination buffer.
// For in-place conversion, if destination dst_argb is same as source sample,

@@ -203,7 +203,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
return 0;
}

// Copy to I400. Source can be I420,422,444,400,NV12,NV21
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,

@@ -895,68 +895,50 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
}

// Convert I420 to RGB24.
// TODO(fbarchard): One step I420ToRGB24Row_NEON.
LIBYUV_API
int I420ToRGB24(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
if (!src_y || !src_u || !src_v ||
!dst_argb ||
!dst_rgb24 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
dst_stride_rgb24 = -dst_stride_rgb24;
}
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON)
void (*I422ToRGB24Row)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToRGB24Row_C;
#if defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
#endif

SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB24Row_C;
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToRGB24Row = I422ToRGB24Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
}
#elif defined(HAS_I422TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
}
}
#endif

for (int y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRGB24Row(row, dst_argb, width);
dst_argb += dst_stride_argb;
I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
dst_rgb24 += dst_stride_rgb24;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;

@@ -967,67 +949,50 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
}

// Convert I420 to RAW.
// TODO(fbarchard): One step I420ToRAWRow_NEON.
LIBYUV_API
int I420ToRAW(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
uint8* dst_raw, int dst_stride_raw,
int width, int height) {
if (!src_y || !src_u || !src_v ||
!dst_argb ||
!dst_raw ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
dst_raw = dst_raw + (height - 1) * dst_stride_raw;
dst_stride_raw = -dst_stride_raw;
}
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON)
void (*I422ToRAWRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToRAWRow_C;
#if defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
#endif

SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRAWRow_C;
#if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
I422ToRAWRow = I422ToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToRAWRow = I422ToRAWRow_NEON;
}
}
#elif defined(HAS_ARGBTORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
}
#elif defined(HAS_I422TORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToRAWRow = ARGBToRAWRow_NEON;
I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
}
}
#endif

for (int y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRAWRow(row, dst_argb, width);
dst_argb += dst_stride_argb;
I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
dst_raw += dst_stride_raw;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
@@ -29,7 +29,7 @@
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
asm volatile (  // NOLINT
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"

@@ -38,7 +38,7 @@ static __inline void __cpuid(int cpu_info[4], int info_type) {
}
#elif defined(__i386__) || defined(__x86_64__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
asm volatile (  // NOLINT
"cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));

@@ -50,7 +50,7 @@ namespace libyuv {
extern "C" {
#endif

// Low level cpuid for X86. Returns zeros on other CPUs.
// Low level cpuid for X86. Returns zeros on other CPUs.
#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__))
LIBYUV_API

@@ -85,7 +85,7 @@ static uint32 XGetBV(unsigned int xcr) {
#define HAS_XGETBV
static uint32 XGetBV(unsigned int xcr) {
uint32 xcr_feature_mask;
asm volatile (
asm volatile (  // NOLINT
".byte 0x0f, 0x01, 0xd0\n"
: "=a"(xcr_feature_mask)
: "c"(xcr)

@@ -124,6 +124,18 @@ int ArmCpuCaps(const char* cpuinfo_name) {
LIBYUV_API
int cpu_info_ = 0;

// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
static bool TestEnv(const char* name) {
const char* var = getenv(name);
if (var) {
if (var[0] != '0') {
return true;
}
}
return false;
}
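TestEnv means any non-zero value disables the feature, while "0" (or unset) leaves it on. An illustrative POSIX test snippet; the variable must be set before the first TestCpuFlag call triggers the lazy InitCpuFlags:

#include <stdlib.h>
#include "libyuv/cpu_id.h"
bool NeonDisableWorks() {
  setenv("LIBYUV_DISABLE_NEON", "1", 1);  // any non-zero value disables
  return libyuv::TestCpuFlag(libyuv::kCpuHasNEON) == 0;
}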
LIBYUV_API
int InitCpuFlags(void) {
#if !defined(__CLR_VER) && defined(CPU_X86)

@@ -144,34 +156,33 @@ int InitCpuFlags(void) {
}
}
#endif

// environment variable overrides for testing.
if (getenv("LIBYUV_DISABLE_X86")) {
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info_ &= ~kCpuHasX86;
}
if (getenv("LIBYUV_DISABLE_SSE2")) {
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
cpu_info_ &= ~kCpuHasSSE2;
}
if (getenv("LIBYUV_DISABLE_SSSE3")) {
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
cpu_info_ &= ~kCpuHasSSSE3;
}
if (getenv("LIBYUV_DISABLE_SSE41")) {
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
cpu_info_ &= ~kCpuHasSSE41;
}
if (getenv("LIBYUV_DISABLE_SSE42")) {
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
cpu_info_ &= ~kCpuHasSSE42;
}
if (getenv("LIBYUV_DISABLE_AVX")) {
if (TestEnv("LIBYUV_DISABLE_AVX")) {
cpu_info_ &= ~kCpuHasAVX;
}
if (getenv("LIBYUV_DISABLE_AVX2")) {
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
cpu_info_ &= ~kCpuHasAVX2;
}
if (getenv("LIBYUV_DISABLE_ASM")) {
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info_ = kCpuInitialized;
}
#elif defined(__arm__)
#if defined(__linux__) && defined(__ARM_NEON__)
#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
// linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__)

@@ -181,10 +192,10 @@ int InitCpuFlags(void) {
cpu_info_ = kCpuHasNEON;
#endif
cpu_info_ |= kCpuInitialized | kCpuHasARM;
if (getenv("LIBYUV_DISABLE_NEON")) {
if (TestEnv("LIBYUV_DISABLE_NEON")) {
cpu_info_ &= ~kCpuHasNEON;
}
if (getenv("LIBYUV_DISABLE_ASM")) {
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info_ = kCpuInitialized;
}
#endif  // __arm__

@@ -21,7 +21,7 @@ extern "C" {
#endif

// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
// and vst would select which 2 components to write. The low level would need
// and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR

#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

@@ -10,6 +10,7 @@

#include "libyuv/mjpeg_decoder.h"

#ifdef HAVE_JPEG
// Must be included before jpeglib
#include <assert.h>
#ifndef __CLR_VER

@@ -80,7 +81,7 @@ MJpegDecoder::~MJpegDecoder() {
}

// Helper function to validate the jpeg looks ok.
// TODO(fbarchard): Improve performance. Scan backward for EOI?
// TODO(fbarchard): Improve performance. Scan backward for EOI?
bool ValidateJpeg(const uint8* sample, size_t sample_size) {
if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size

@@ -105,7 +106,7 @@ bool ValidateJpeg(const uint8* sample, size_t sample_size) {
}
}
if (!total_eoi) {
// ERROR: Invalid jpeg end code not found. Size sample_size
// ERROR: Invalid jpeg end code not found. Size sample_size
return false;
}
return true;

@@ -578,3 +579,5 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
}

}  // namespace libyuv
#endif  // HAVE_JPEG
@@ -105,6 +105,130 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
}
}

// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
void (*YUY2ToUV422Row)(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void (*YUY2ToYRow)(const uint8* src_yuy2,
uint8* dst_y, int pix);
YUY2ToYRow = YUY2ToYRow_C;
YUY2ToUV422Row = YUY2ToUV422Row_C;
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
}
}
#elif defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width > 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
}
}
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUV422Row = YUY2ToUV422Row_NEON;
}
}
#endif

for (int y = 0; y < height; ++y) {
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
src_yuy2 += src_stride_yuy2;
dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
return 0;
}

// Convert UYVY to I422.
LIBYUV_API
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
void (*UYVYToUV422Row)(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void (*UYVYToYRow)(const uint8* src_uyvy,
uint8* dst_y, int pix);
UYVYToYRow = UYVYToYRow_C;
UYVYToUV422Row = UYVYToUV422Row_C;
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToUV422Row = UYVYToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToYRow = UYVYToYRow_SSE2;
}
}
}
}
#elif defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width > 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
}
}
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
UYVYToUV422Row = UYVYToUV422Row_NEON;
}
}
#endif

for (int y = 0; y < height; ++y) {
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
src_uyvy += src_stride_uyvy;
dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
return 0;
}
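The two converters above differ only in the 4:2:2 packing they unpack: YUY2 stores Y0 U Y1 V per two pixels, UYVY stores U Y0 V Y1. A minimal sketch of one two-pixel group from each (illustrative only):

void UnpackYUY2(const uint8* s, uint8* y0, uint8* u, uint8* y1, uint8* v) {
  *y0 = s[0]; *u = s[1]; *y1 = s[2]; *v = s[3];
}
void UnpackUYVY(const uint8* s, uint8* y0, uint8* u, uint8* y1, uint8* v) {
  *u = s[0]; *y0 = s[1]; *v = s[2]; *y1 = s[3];
}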
// Mirror I420 with optional flipping
LIBYUV_API
int I420Mirror(const uint8* src_y, int src_stride_y,

@@ -721,6 +845,11 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
}
#endif

SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =

@@ -789,129 +918,6 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
return 0;
}

// SetRow8 writes 'count' bytes using a 32 bit value repeated
// SetRow32 writes 'count' words using a 32 bit value repeated

#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SETROW_NEON
static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (  // NOLINT
"vdup.u32 q0, %2 \n"  // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n"  // 16 bytes per loop
"vst1.u32 {q0}, [%0]! \n"  // store
"bgt 1b \n"
: "+r"(dst),  // %0
"+r"(count)  // %1
: "r"(v32)  // %2
: "q0", "memory", "cc");
}

// TODO(fbarchard): Make fully assembler
static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
SetRow8_NEON(dst, v32, width << 2);
dst += dst_stride;
}
}

#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SETROW_X86
__declspec(naked) __declspec(align(16))
static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4]  // dst
mov eax, [esp + 8]  // v32
mov ecx, [esp + 12]  // count
shr ecx, 2
rep stosd
mov edi, edx
ret
}
}

__declspec(naked) __declspec(align(16))
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
push esi
push edi
push ebp
mov edi, [esp + 12 + 4]  // dst
mov eax, [esp + 12 + 8]  // v32
mov ebp, [esp + 12 + 12]  // width
mov edx, [esp + 12 + 16]  // dst_stride
mov esi, [esp + 12 + 20]  // height
lea ecx, [ebp * 4]
sub edx, ecx  // stride - width * 4

align 16
convertloop:
mov ecx, ebp
rep stosd
add edi, edx
sub esi, 1
jg convertloop

pop ebp
pop edi
pop esi
ret
}
}

#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SETROW_X86
static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile (  // NOLINT
"shr $0x2,%1 \n"
"rep stosl \n"
: "+D"(dst),  // %0
"+c"(width_tmp)  // %1
: "a"(v32)  // %2
: "memory", "cc");
}

static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
size_t width_tmp = static_cast<size_t>(width);
uint32* d = reinterpret_cast<uint32*>(dst);
asm volatile (  // NOLINT
"rep stosl \n"
: "+D"(d),  // %0
"+c"(width_tmp)  // %1
: "a"(v32)  // %2
: "memory", "cc");
dst += dst_stride;
}
}
#endif

static void SetRow8_C(uint8* dst, uint32 v8, int count) {
#ifdef _MSC_VER
for (int x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
}

static void SetRows32_C(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
uint32* d = reinterpret_cast<uint32*>(dst);
for (int x = 0; x < width; ++x) {
d[x] = v32;
}
dst += dst_stride;
}
}

LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,

@@ -929,13 +935,6 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
SetRow = SetRow8_X86;
}
#endif
#if defined(HAS_SETROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
SetRow = SetRow8_SSE2;
}
#endif

uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
// Set plane
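A short usage sketch for the SetPlane path above (hypothetical helper; 16 is black for video-range Y):

#include "libyuv/planar_functions.h"
void ClearYPlaneToBlack(uint8* y_plane, int stride, int width, int height) {
  libyuv::SetPlane(y_plane, stride, width, height, 16u);
}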
@@ -1242,7 +1241,7 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
}

// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,

@@ -1270,7 +1269,7 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,

// Blur ARGB image.
// Caller should allocate CumulativeSum table of width * height * 16 bytes
// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
// as the buffer is treated as circular.
LIBYUV_API
int ARGBBlur(const uint8* src_argb, int src_stride_argb,

@@ -1290,7 +1289,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
}
#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
ARGBComputeCumulativeSum(src_argb, src_stride_argb,
dst_cumsum, dst_stride32_cumsum,

@@ -814,7 +814,7 @@ void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is a transpose with the source read
// from bottom to top. So set the source pointer to the end
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;
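An illustrative plain-C++ reference for the comment above, assuming the library's clockwise rotation convention (sketch, not the optimized transpose path):

void RotatePlane90_Reference(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride,
                             int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // Read the source bottom-to-top and transpose.
      dst[x * dst_stride + (height - 1 - y)] = src[y * src_stride + x];
    }
  }
}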
@@ -826,7 +826,7 @@ void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is a transpose with the destination written
// from bottom to top. So set the destination pointer to the end
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;

@@ -880,7 +880,7 @@ void RotatePlane180(const uint8* src, int src_stride,
if (width > kMaxStride) {
return;
}
// Swap first and last row and mirror the content. Uses a temporary row.
// Swap first and last row and mirror the content. Uses a temporary row.
SIMD_ALIGNED(uint8 row[kMaxStride]);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);

@@ -58,7 +58,7 @@ void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is a ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;

@@ -69,7 +69,7 @@ void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is a ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;

@@ -109,7 +109,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
if (width * 4 > kMaxStride) {
return;
}
// Swap first and last row and mirror the content. Uses a temporary row.
// Swap first and last row and mirror the content. Uses a temporary row.
SIMD_ALIGNED(uint8 row[kMaxStride]);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);

@@ -26,12 +26,12 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %4, #8 \n"

// handle 8x8 blocks. this should be the majority of the plane
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"

@@ -81,7 +81,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"subs %4, #8 \n"  // w -= 8
"bge 1b \n"

// add 8 back to counter. if the result is 0 there are
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %4, #8 \n"
"beq 4f \n"

@@ -193,12 +193,12 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_b, int dst_stride_b,
int width) {
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %6, #8 \n"

// handle 8x8 blocks. this should be the majority of the plane
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"

@@ -264,7 +264,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"subs %6, #8 \n"  // w -= 8
"bge 1b \n"

// add 8 back to counter. if the result is 0 there are
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %6, #8 \n"
"beq 4f \n"

@@ -330,7 +330,7 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
int sb = (b * 17 + g * 68 + r * 35) >> 7;
int sg = (b * 22 + g * 88 + r * 45) >> 7;
int sr = (b * 24 + g * 98 + r * 50) >> 7;
// b does not over flow. a is preserved from original.
// b does not over flow. a is preserved from original.
if (sg > 255) {
sg = 255;
}

@@ -344,7 +344,7 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
}
}

// Apply color matrix to a row of image. Matrix is signed.
// Apply color matrix to a row of image. Matrix is signed.
void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
for (int x = 0; x < width; ++x) {
int b = dst_argb[0];

@@ -459,6 +459,14 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
(255u << ashift);
}

static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
int32 y1 = (static_cast<int32>(y) - 16) * YG;
*b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
*g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
*r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
}
void I444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,

@@ -492,6 +500,48 @@ void I422ToARGBRow_C(const uint8* y_buf,
}
}

void I422ToRGB24Row_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 6;  // Advance 2 pixels.
}
if (width & 1) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
}
}

void I422ToRAWRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 6;  // Advance 2 pixels.
}
if (width & 1) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
}
}
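The two new C rows differ only in output byte order: RGB24 writes B, G, R and RAW writes R, G, B. A tiny hypothetical harness showing both for a two-pixel video-range white input (note that the odd-width tail of I422ToRAWRow_C above writes the +0, +1, +2 order, matching RGB24):

#include <stdio.h>
void DemoTwoPixels() {
  uint8 y[2] = {235, 235};
  uint8 u = 128, v = 128;  // video-range white
  uint8 rgb24[6], raw[6];
  I422ToRGB24Row_C(y, &u, &v, rgb24, 2);  // {B, G, R, B, G, R}
  I422ToRAWRow_C(y, &u, &v, raw, 2);      // {R, G, B, R, G, B}
  printf("%d %d %d / %d %d %d\n", rgb24[0], rgb24[1], rgb24[2],
         raw[0], raw[1], raw[2]);
}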
void I411ToARGBRow_C(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
@ -671,6 +721,28 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
|
||||
memcpy(dst, src, count);
|
||||
}
|
||||
|
||||
void SetRow8_C(uint8* dst, uint32 v8, int count) {
|
||||
#ifdef _MSC_VER
|
||||
// VC will generate rep stosb.
|
||||
for (int x = 0; x < count; ++x) {
|
||||
dst[x] = v8;
|
||||
}
|
||||
#else
|
||||
memset(dst, v8, count);
|
||||
#endif
|
||||
}
|
||||
|
||||
void SetRows32_C(uint8* dst, uint32 v32, int width,
|
||||
int dst_stride, int height) {
|
||||
for (int y = 0; y < height; ++y) {
|
||||
uint32* d = reinterpret_cast<uint32*>(dst);
|
||||
for (int x = 0; x < width; ++x) {
|
||||
d[x] = v32;
|
||||
}
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
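// Usage sketch (illustrative only): fill a 16x8 block of ARGB pixels with
// opaque red.  0xFFFF0000u assumes the little-endian B,G,R,A byte order
// libyuv uses for ARGB.
//   SetRows32_C(dst, 0xFFFF0000u, 16, dst_stride, 8);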

// Filter 2 rows of YUY2 UV's (422) into U and V (420).
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
@ -950,6 +1022,11 @@ Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
#endif
#ifdef HAS_I422TORGB24ROW_SSSE3
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \
I422ToRGB24Row_C, 1)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
#endif
#ifdef HAS_I422TORGBAROW_SSSE3
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
#endif
@ -958,6 +1035,10 @@ YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0)
Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0)
YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1)
YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1)
#endif
#undef YANY


@ -101,8 +101,8 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TOARGBROW_NEON
@ -135,8 +135,8 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TOBGRAROW_NEON
@ -169,8 +169,8 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TOABGRROW_NEON
@ -202,12 +202,77 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TORGBAROW_NEON

#ifdef HAS_I422TORGB24ROW_NEON
void I422ToRGB24Row_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
"subs %4, %4, #8 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TORGB24ROW_NEON
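// Clarifying note: vst3.8 interleaves d20/d21/d22 on store, so each loop
// iteration above writes 8 RGB24 pixels (24 bytes); the vswp in the RAW
// variant below swaps the R and B lanes before the same interleaved store.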

#ifdef HAS_I422TORAWROW_NEON
void I422ToRAWRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TORAWROW_NEON

#ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
@ -233,8 +298,8 @@ void NV12ToARGBRow_NEON(const uint8* y_buf,
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_NV12TOARGBROW_NEON
@ -264,8 +329,8 @@ void NV21ToARGBRow_NEON(const uint8* y_buf,
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_NV21TOARGBROW_NEON
@ -312,7 +377,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_NEON

#ifdef HAS_SETROW_NEON
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( // NOLINT
"vdup.u32 q0, %2 \n" // duplicate 4 ints
@ -327,7 +392,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
}

// TODO(fbarchard): Make fully assembler
// SetRow32 writes 'count' words using a 32 bit value repeated.
void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
@ -344,11 +409,11 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %1, %2 \n"
// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"
// the output is written in two blocks.  8 bytes followed
// by another 8.  reading is done sequentially, from left to
// right.  writing is done from right to left in block sizes
// %1, the destination pointer is incremented after writing
// the first of the two blocks.  need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"
"beq 2f \n"
@ -356,9 +421,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// back of destination by the size of the register that is
// going to be mirrored
"sub %1, #16 \n"
// the loop needs to run on blocks of 16.  what will be left
// over is either a negative number, the residuals that need
// to be done, or 0.  If this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"

@ -375,7 +440,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16
"bge 1b \n"

// add 16 back to the counter.  if the result is 0 there are no
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"
@ -430,9 +495,9 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
// going to be mirrored
"sub %1, #8 \n"
"sub %2, #8 \n"
// the loop needs to run on blocks of 8.  what will be left
// over is either a negative number, the residuals that need
// to be done, or 0.  if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"

@ -446,7 +511,7 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
"bge 1b \n"

// add 8 back to the counter.  if the result is 0 there are no
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"

@ -741,9 +741,9 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}

// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers.  "m" effectively takes
// 3 registers - ebx, ebp and eax.  "m" can be passed with 3 normal registers,
// or 4 if stack frame is disabled.  Doing 2 assembly blocks is a work around
// and considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
@ -2143,6 +2143,34 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
}
#endif // HAS_COPYROW_X86

#ifdef HAS_SETROW_X86
void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile (
"shr $0x2,%1 \n"
"rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
}
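// Clarifying note: 'rep stosl' stores EAX to [EDI] ECX times, advancing EDI
// by 4 per store; the 'shr $0x2' converts the byte count in width to a
// dword count, so width is assumed to be a multiple of 4 here.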

void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
size_t width_tmp = static_cast<size_t>(width);
uint32* d = reinterpret_cast<uint32*>(dst);
asm volatile (
"rep stosl \n"
: "+D"(d), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
dst += dst_stride;
}
}
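// Clarifying note: width_tmp is tied to ECX ("+c"), which rep stosl counts
// down to zero, so it is re-initialized on every row of the loop.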
#endif // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
@ -2998,7 +3026,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#endif // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale.  0.11 * B + 0.59 * G + 0.30 * R
CONST vec8 kARGBToGray = {
14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};
@ -3455,7 +3483,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd %%xmm0,%1

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,

@ -18,6 +18,7 @@ extern "C" {
// This module is for Visual C x86.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
@ -2521,6 +2522,54 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_X86

#ifdef HAS_SETROW_X86
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
mov edi, edx
ret
}
}

// SetRow32 writes 'count' words using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
push esi
push edi
push ebp
mov edi, [esp + 12 + 4] // dst
mov eax, [esp + 12 + 8] // v32
mov ebp, [esp + 12 + 12] // width
mov edx, [esp + 12 + 16] // dst_stride
mov esi, [esp + 12 + 20] // height
lea ecx, [ebp * 4]
sub edx, ecx // stride - width * 4

align 16
convertloop:
mov ecx, ebp
rep stosd
add edi, edx
sub esi, 1
jg convertloop

pop ebp
pop edi
pop esi
ret
}
}
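// Clarifying note: the [esp + 12 + N] operands account for the 12 bytes of
// esi/edi/ebp pushed above, so N indexes the original argument slots.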
#endif // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
@ -3497,7 +3546,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg.  To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16))
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,

529  source/scale.cc
@ -54,514 +54,49 @@ void SetUseReferenceImpl(bool use) {
#define HAS_SCALEROWDOWN2_NEON
// Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
uint8* dst, int dst_width);

void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
// row 2 add adjacent, add row 1 to row 2
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
uint8* dst, int dst_width);

#define HAS_SCALEROWDOWN4_NEON
static void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"

"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}

static void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"

"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"

"vpaddl.u16 q0, q0 \n"

"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding

"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"

"subs %2, #4 \n"
"bgt 1b \n"

: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);

#define HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels.  Use the neon multilane read/write
// to load up every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
static void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}

static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1

// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"

// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"

// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"

// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"

// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"

// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"

"vst3.u8 {d0, d1, d2}, [%1]! \n"

"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}

static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1

// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"

// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"

// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"

// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"

"vst3.u8 {d0, d1, d2}, [%1]! \n"

"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);

#define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };

// 32 -> 12
static void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}

void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
// 32x3 -> 12x1
static void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"

// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"

// Shuffle the input data around to align the data
// so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"

// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"

// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"

// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"

// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"

// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"

// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg.  This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded.  Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"

// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"

// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"

// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"

// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"

// Need to divide, but can't downshift as the value
// isn't a power of 2.  So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"

// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"

"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"

"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}

void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x2 -> 12x1
static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"

// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"

// Shuffle the input data around to align the data
// so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"

// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"

// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"

// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"

// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"

// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"

// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg.  This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded.  Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"

// combine source lines
"vadd.u16 q1, q3 \n"

// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"

// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"

// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"

// Need to divide, but can't downshift as the value
// isn't a power of 2.  So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"

// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"

"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"

"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}

void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 16x2 -> 16x1
#define HAS_SCALEFILTERROWS_NEON
static void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"

"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"

"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"

"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction);

/**
* SSE2 downscalers with interpolation.
@ -1010,7 +545,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
@ -1049,7 +584,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
@ -3420,7 +2955,7 @@ int I420Scale(const uint8* src_y, int src_stride_y,
dst_halfwidth = dst_width >> 1;
}
// If caller used height / 2 when computing src_v, it will point into what
// should be the src_u plane.  Detect this and reduce halfheight to match.
int uv_src_plane_size = src_halfwidth * src_halfheight;
if ((src_height & 1) &&
(src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
@ -3484,7 +3019,7 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
dst_halfwidth = dst_width >> 1;
}
// If caller used height / 2 when computing src_v, it will point into what
// should be the src_u plane.  Detect this and reduce halfheight to match.
int uv_src_plane_size = src_halfwidth * src_halfheight;
if ((src_height & 1) &&
(src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {

534  source/scale_neon.cc  Normal file
@ -0,0 +1,534 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/

#include "libyuv/basic_types.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)

/**
* NEON downscalers with interpolation.
*
* Provided by Fritz Koenig
*
*/

void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
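// Usage note (added for clarity): this halves a row by keeping the even
// pixels, 16 outputs per iteration; callers appear to pass dst_width values
// that are multiples of 16, with wrappers handling any remainder.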

void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
"vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
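// Arithmetic note (added for clarity): each output byte is the rounded
// average of a 2x2 block; vrshrn #2 computes (a + b + c + d + 2) >> 2,
// e.g. (1 + 2 + 3 + 4 + 2) >> 2 = 3.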

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}

void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
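// Arithmetic note (added for clarity): the vpaddl/vpadal chain sums each
// 4x4 block of source bytes into a 32-bit lane, and vrshrn #4 produces the
// rounded average, (sum + 8) >> 4.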

// Down scale from 4 to 3 pixels.  Use the neon multilane read/write
// to load up every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}

void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1

// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"

// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"

// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"

// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"

// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"

// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"

"vst3.u8 {d0, d1, d2}, [%1]! \n"

"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}

void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1

// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"

// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"

// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"

// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"

"vst3.u8 {d0, d1, d2}, [%1]! \n"

"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}

#define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
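// Clarifying note on the constants: vqrdmulh.s16 returns
// (2 * a * b + 0x8000) >> 16, so a multiplier of 65536 / 12 divides by
// roughly 6 and 65536 / 18 by roughly 9; e.g. 54 * (65536 / 18) via
// vqrdmulh gives (2 * 54 * 3640 + 32768) >> 16 = 6 == 54 / 9.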

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"

// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"

// Shuffle the input data around to align the data
// so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"

// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"

// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"

// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"

// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"

// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"

// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg.  This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded.  Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"

// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"

// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"

// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"

// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"

// Need to divide, but can't downshift as the value
// isn't a power of 2.  So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"

// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"

"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"

"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"

// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"

// Shuffle the input data around to align the data
// so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"

// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"

// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"

// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"

// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"

// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"

// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg.  This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded.  Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"

// combine source lines
"vadd.u16 q1, q3 \n"

// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"

// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"

// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"

// Need to divide, but can't downshift as the value
// isn't a power of 2.  So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"

// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"

"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"

"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"

"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"

"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"

"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}

#endif // __ARM_NEON__

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

@ -76,8 +76,8 @@ static int ARGBTestRotate(int src_width, int src_height,
printf("filter %d - %8d us C - %8d us OPT\n",
mode, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));

// C version may be a little off from the optimized.  Order of
// operations may introduce rounding somewhere.  So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int max_diff = 0;

@ -80,8 +80,8 @@ static int ARGBTestFilter(int src_width, int src_height,
printf("filter %d - %8d us C - %8d us OPT\n",
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));

// C version may be a little off from the optimized.  Order of
// operations may introduce rounding somewhere.  So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int max_diff = 0;

@ -118,8 +118,8 @@ static int TestFilter(int src_width, int src_height,
printf("filter %d - %8d us C - %8d us OPT\n",
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));

// C version may be a little off from the optimized.  Order of
// operations may introduce rounding somewhere.  So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int max_diff = 0;