Move Neon source to its own files.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/860009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@396 16f28f9a-4ce2-e073-06de-1de4eb20be90
fbarchard@google.com 2012-10-09 00:05:29 +00:00
parent 4807dea4e7
commit 64ce0ab544
32 changed files with 1262 additions and 868 deletions

View File

@ -22,8 +22,10 @@ LOCAL_SRC_FILES := \
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_CFLAGS += -DLIBYUV_NEON
LOCAL_SRC_FILES += \
source/compare_neon.cc \
source/rotate_neon.cc.neon \
source/row_neon.cc.neon \
source/scale_neon.cc
endif
LOCAL_C_INCLUDES += $(LOCAL_PATH)/include

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 395
Version: 396
License: BSD
License File: LICENSE

View File

@ -18,7 +18,7 @@ namespace libyuv {
extern "C" {
#endif
// Compute a hash for specified memory. Seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
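// A reference sketch of the hash declared above: DJB2 is the Bernstein
// recurrence hash = hash * 33 + byte, which is why a seed of 5381 (the
// classic starting value) is recommended. The name below is illustrative,
// not the library's internal symbol.
static uint32 HashDjb2_Sketch(const uint8* src, uint64 count, uint32 seed) {
  uint32 hash = seed;
  for (uint64 i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];
  }
  return hash;
}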

View File

@ -73,7 +73,7 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert NV12 to I420. Also used for NV21.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
@ -229,7 +229,7 @@ int MJPGToI420(const uint8* sample, size_t sample_size,
// Must be less than or equal to src_width/src_height
// Cropping parameters are pre-rotation.
// "rotation" can be 0, 90, 180 or 270.
// "format" is a fourcc. ie 'I420', 'YUY2'
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToI420(const uint8* src_frame, size_t src_size,

View File

@ -19,7 +19,7 @@
// TODO(fbarchard): This set of functions should exactly match convert.h
// Add missing V210 and Q420.
// TODO(fbarchard): Add tests. Create random content of right size and convert
// with C vs Opt and/or to I420 and compare.
// TODO(fbarchard): Some of these functions lack parameter setting.
@ -75,7 +75,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I400 to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
@ -209,7 +209,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
// Must be less than or equal to src_width/src_height
// Cropping parameters are pre-rotation.
// "rotation" can be 0, 90, 180 or 270.
// "format" is a fourcc. ie 'I420', 'YUY2'
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToARGB(const uint8* src_frame, size_t src_size,

View File

@ -50,7 +50,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
LIBYUV_API
int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,

View File

@ -58,7 +58,7 @@ static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API
void MaskCpuFlags(int enable_flags);
// Low level cpuid for X86. Returns zeros on other CPUs.
LIBYUV_API
void CpuId(int cpu_info[4], int info_type);
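// Usage sketch for the declarations above (values illustrative): query cpuid
// leaf 1 directly, or gate a code path on a cached feature flag.
//   int cpu_info[4];
//   CpuId(cpu_info, 1);  // leaf 1: feature bits on x86, zeros elsewhere.
//   if (TestCpuFlag(kCpuHasSSSE3)) {
//     // Select an SSSE3 row function.
//   }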

View File

@ -27,13 +27,31 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value);
// Alias.
#define I400ToI400 CopyPlane
// Copy a plane of data (I420 to I400).
LIBYUV_API
void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert UYVY to I422.
LIBYUV_API
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
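// For orientation: YUY2 packs two pixels into four bytes as Y0 U Y1 V, and
// UYVY as U Y0 V Y1, so both conversions above only split bytes into planes.
// A scalar sketch of one YUY2 row (helper name is illustrative):
static void YUY2ToI422Row_Sketch(const uint8* src_yuy2, uint8* dst_y,
                                 uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width - 1; x += 2) {  // assumes even width
    dst_y[0] = src_yuy2[0];
    dst_u[0] = src_yuy2[1];
    dst_y[1] = src_yuy2[2];
    dst_v[0] = src_yuy2[3];
    src_yuy2 += 4;
    dst_y += 2;
    dst_u += 1;
    dst_v += 1;
  }
}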
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
@ -196,7 +214,7 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
// interval_offset should be a value between 0 and 255.
@ -261,7 +279,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
int w, int h, int dw, int dh);
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
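// In recurrence form (a sketch of the definition, not of the SSE2 code):
//   cumsum(x, y) = value(x, y) + cumsum(x - 1, y) + cumsum(x, y - 1)
//                - cumsum(x - 1, y - 1)
// which lets any axis-aligned box sum be read back with four table lookups.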
LIBYUV_API
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
@ -299,7 +317,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
#define YUV_DISABLE_ASM
#endif
// Row functions for copying pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
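// A plausible scalar form of the affine row declared above: uv_dudv holds a
// start point (u, v) followed by a per-pixel step (du, dv), and each output
// pixel copies one 32-bit ARGB sample along that line through the source
// (name is illustrative).
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = static_cast<int>(u);
    int y = static_cast<int>(v);
    *reinterpret_cast<uint32*>(dst_argb) = *reinterpret_cast<const uint32*>(
        src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}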

View File

@ -66,6 +66,7 @@ extern "C" {
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
#define HAS_SETROW_X86
#define HAS_SPLITUV_SSE2
#define HAS_UYVYTOUV422ROW_SSE2
#define HAS_UYVYTOUVROW_SSE2
@ -76,13 +77,13 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2
// Effects
#define HAS_ARGBAFFINEROW_SSE2
#define HAS_ARGBATTENUATEROW_SSSE3
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBCOLORMATRIXROW_SSSE3
#define HAS_ARGBGRAYROW_SSSE3
#define HAS_ARGBINTERPOLATEROW_SSSE3
#define HAS_ARGBMIRRORROW_SSSE3
#define HAS_ARGBQUANTIZEROW_SSE2
#define HAS_ARGBSEPIAROW_SSSE3
#define HAS_ARGBSHADE_SSE2
@ -93,9 +94,9 @@ extern "C" {
// The following are Windows only:
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_ARGBCOLORTABLEROW_X86
#define HAS_I422TORGBAROW_SSSE3
#define HAS_RGBATOARGBROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
@ -105,36 +106,42 @@ extern "C" {
#if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_SSSE3_ONLY)
#define HAS_ARGBATTENUATE_SSE2
#define HAS_ARGBBLENDROW_SSE2
#define HAS_MIRRORROW_SSE2
#endif
// The following are available on Neon platforms
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_COPYROW_NEON
#define HAS_I422TOABGRROW_NEON
#define HAS_I422TOARGBROW_NEON
#define HAS_I422TOBGRAROW_NEON
#define HAS_I422TORAWROW_NEON
#define HAS_I422TORGB24ROW_NEON
#define HAS_I422TORGBAROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITUV_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
// TODO(fbarchard): Hook these up to calling functions.
#define HAS_ABGRTOARGBROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGBAROW_NEON
#define HAS_BGRATOARGBROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV21TOARGBROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGBATOARGBROW_NEON
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER)
@ -189,6 +196,24 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRGB24Row_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width);
void NV21ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width);
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@ -237,6 +262,15 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);
void SetRow8_X86(uint8* dst, uint32 v32, int count);
void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height);
void SetRow8_NEON(uint8* dst, uint32 v32, int count);
void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height);
void SetRow8_C(uint8* dst, uint32 v32, int count);
void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height);
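// Illustrative call of the Set rows above: fill a 16x8 pixel rectangle of an
// ARGB surface at (20, 10) with opaque red (buffer and stride hypothetical).
//   SetRows32_C(dst_argb + 10 * dst_stride + 20 * 4, 0xFFFF0000u,
//               16, dst_stride, 8);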
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
@ -341,6 +375,16 @@ void I422ToRGBARow_C(const uint8* y_buf,
const uint8* v_buf,
uint8* rgba_buf,
int width);
void I422ToRGB24Row_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb24_buf,
int width);
void I422ToRAWRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* raw_buf,
int width);
void YToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
@ -517,30 +561,44 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void I422ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToBGRARow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToABGRRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRGBARow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRGB24Row_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
@ -671,4 +729,3 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 395
#define LIBYUV_VERSION 396
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -64,6 +64,7 @@
# sources.
'source/compare.cc',
'source/compare_neon.cc',
'source/convert.cc',
'source/convert_argb.cc',
'source/convert_from.cc',
@ -79,6 +80,7 @@
'source/row_posix.cc',
'source/row_win.cc',
'source/scale.cc',
'source/scale_neon.cc',
'source/scale_argb.cc',
'source/video_common.cc',
],

View File

@ -244,44 +244,10 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
return seed;
}
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
int count) {
volatile uint32 sse;
asm volatile (
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q7, d4, d4 \n"
"vmlal.s16 q8, d6, d6 \n"
"vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n"
"bgt 1b \n"
"vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n"
"vadd.u32 q10, q7, q9 \n"
"vpaddl.u32 q1, q10 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
return sse;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SUMSQUAREERROR_SSE2

source/compare_neon.cc Normal file
View File

@ -0,0 +1,62 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
".p2align 2 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}
#endif // __ARM_NEON__
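// For reference, a scalar equivalent of the Neon loop above (the asm widens
// the byte differences to 16 bits before squaring and accumulates in four
// 32-bit lanes; the code below is the same arithmetic, modulo order of
// summation):
static uint32 SumSquareError_Sketch(const uint8* src_a, const uint8* src_b,
                                    int count) {
  uint32 sse = 0u;
  for (int i = 0; i < count; ++i) {
    int diff = src_a[i] - src_b[i];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}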
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@ -62,6 +62,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0;
}
// Move to row_win etc.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_HALFROW_SSE2
__declspec(naked) __declspec(align(16))
@ -188,7 +189,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
// Blends 32x2 pixels to 16x1
// source in scale.cc
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
@ -393,7 +394,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
// M420 format description:
// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
// Chroma is half width / half height. (420)
// src_stride_m420 is row planar. Normally this will be the width in pixels.
// The UV plane is half width, but 2 values, so src_stride_m420 applies to
// this as well as the two Y planes.
static int X420ToI420(const uint8* src_y,
@ -592,10 +593,10 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
// This policy assumes that the caller handles the last row of an odd height
// image using C.
// READSAFE_PAGE - enable read ahead within same page.
// A page is 4096 bytes. When reading ahead, if the last pixel is near the
// end of the page, and a read spans the page into the next page, a memory
// exception can occur if that page has not been allocated, or is a guard
// page. This setting ensures the overread is within the same page.
// READSAFE_ALWAYS - enables read ahead on systems without memory exceptions
// or where buffers are padded by 64 bytes.
@ -790,7 +791,7 @@ static inline uint32 READWORD(const uint8* p) {
}
#endif
// Must be multiple of 6 pixels. Will over convert to handle remainder.
// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210
static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
for (int x = 0; x < width; x += 6) {
@ -820,7 +821,7 @@ static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
}
// Convert V210 to I420.
// V210 is 10 bit version of UYVY. 16 bytes to store 6 pixels.
// Width must be a multiple of 48.
LIBYUV_API
int V210ToI420(const uint8* src_v210, int src_stride_v210,
@ -1611,7 +1612,7 @@ static void JpegI400ToI420(void* opaque,
}
// MJPG (Motion JPeg) to I420
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
LIBYUV_API
int MJPGToI420(const uint8* sample,
size_t sample_size,
@ -1689,7 +1690,7 @@ int MJPGToI420(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
@ -1734,7 +1735,7 @@ int ConvertToI420(const uint8* sample,
}
int r = 0;
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
// and then rotate the I420 to the final destination buffer.
// For in-place conversion, if destination y is same as source sample,

View File

@ -556,6 +556,14 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
}
}
#endif
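// Dispatch note: the _Any_ variant accepts any width of at least 8, doing
// the bulk of the row with Neon and the remainder in C, while the plain
// Neon row requires the width to be a multiple of 8.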
for (int y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, width);
@ -571,10 +579,10 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
// Convert NV21 to ARGB.
LIBYUV_API
int NV21ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
const uint8* src_uv, int src_stride_uv,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_y || !src_vu || !dst_argb ||
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
@ -585,7 +593,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
void (*NV21ToARGBRow)(const uint8* y_buf,
const uint8* vu_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width) = NV21ToARGBRow_C;
#if defined(HAS_NV21TOARGBROW_SSSE3)
@ -599,13 +607,21 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_NV21TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_NEON;
}
}
#endif
for (int y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_vu, dst_argb, width);
NV21ToARGBRow(src_y, src_uv, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_vu += src_stride_vu;
src_uv += src_stride_uv;
}
}
return 0;
@ -890,7 +906,7 @@ static void JpegI400ToARGB(void* opaque,
}
// MJPG (Motion JPeg) to ARGB
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
LIBYUV_API
int MJPGToARGB(const uint8* sample,
size_t sample_size,
@ -966,7 +982,7 @@ int MJPGToARGB(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
@ -1004,7 +1020,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
}
int r = 0;
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
// and then rotate the I420 to the final destination buffer.
// For in-place conversion, if destination dst_argb is same as source sample,

View File

@ -203,7 +203,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
return 0;
}
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
@ -895,68 +895,50 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
}
// Convert I420 to RGB24.
// TODO(fbarchard): One step I420ToRGB24Row_NEON.
LIBYUV_API
int I420ToRGB24(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
if (!src_y || !src_u || !src_v ||
!dst_argb ||
!dst_rgb24 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
dst_stride_rgb24 = -dst_stride_rgb24;
}
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON)
void (*I422ToRGB24Row)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToRGB24Row_C;
#if defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
#endif
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB24Row_C;
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToRGB24Row = I422ToRGB24Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
}
#elif defined(HAS_I422TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
}
}
#endif
for (int y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRGB24Row(row, dst_argb, width);
dst_argb += dst_stride_argb;
I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
dst_rgb24 += dst_stride_rgb24;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
@ -967,67 +949,50 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
}
// Convert I420 to RAW.
// TODO(fbarchard): One step I420ToRAWRow_NEON.
LIBYUV_API
int I420ToRAW(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
uint8* dst_raw, int dst_stride_raw,
int width, int height) {
if (!src_y || !src_u || !src_v ||
!dst_argb ||
!dst_raw ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
dst_raw = dst_raw + (height - 1) * dst_stride_raw;
dst_stride_raw = -dst_stride_raw;
}
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON)
void (*I422ToRAWRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToRAWRow_C;
#if defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
#endif
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRAWRow_C;
#if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
I422ToRAWRow = I422ToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToRAWRow = I422ToRAWRow_NEON;
}
}
#elif defined(HAS_ARGBTORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
}
#elif defined(HAS_I422TORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToRAWRow = ARGBToRAWRow_NEON;
I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
}
}
#endif
for (int y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRAWRow(row, dst_argb, width);
dst_argb += dst_stride_argb;
I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
dst_raw += dst_stride_raw;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;

View File

@ -29,7 +29,7 @@
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
asm volatile ( // NOLINT
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
@ -38,7 +38,7 @@ static __inline void __cpuid(int cpu_info[4], int info_type) {
}
#elif defined(__i386__) || defined(__x86_64__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
asm volatile ( // NOLINT
"cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
@ -50,7 +50,7 @@ namespace libyuv {
extern "C" {
#endif
// Low level cpuid for X86. Returns zeros on other CPUs.
#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__))
LIBYUV_API
@ -85,7 +85,7 @@ static uint32 XGetBV(unsigned int xcr) {
#define HAS_XGETBV
static uint32 XGetBV(unsigned int xcr) {
uint32 xcr_feature_mask;
asm volatile (
asm volatile ( // NOLINT
".byte 0x0f, 0x01, 0xd0\n"
: "=a"(xcr_feature_mask)
: "c"(xcr)
@ -124,6 +124,18 @@ int ArmCpuCaps(const char* cpuinfo_name) {
LIBYUV_API
int cpu_info_ = 0;
// Test environment variable for disabling CPU features. Any non-zero value
// disables the feature; zero is ignored, making the variable easy to toggle.
static bool TestEnv(const char* name) {
const char* var = getenv(name);
if (var) {
if (var[0] != '0') {
return true;
}
}
return false;
}
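// For example, LIBYUV_DISABLE_NEON=1 (or any non-zero value) masks off the
// Neon paths below, while LIBYUV_DISABLE_NEON=0 or an unset variable leaves
// them enabled.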
LIBYUV_API
int InitCpuFlags(void) {
#if !defined(__CLR_VER) && defined(CPU_X86)
@ -144,34 +156,33 @@ int InitCpuFlags(void) {
}
}
#endif
// environment variable overrides for testing.
if (getenv("LIBYUV_DISABLE_X86")) {
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info_ &= ~kCpuHasX86;
}
if (getenv("LIBYUV_DISABLE_SSE2")) {
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
cpu_info_ &= ~kCpuHasSSE2;
}
if (getenv("LIBYUV_DISABLE_SSSE3")) {
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
cpu_info_ &= ~kCpuHasSSSE3;
}
if (getenv("LIBYUV_DISABLE_SSE41")) {
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
cpu_info_ &= ~kCpuHasSSE41;
}
if (getenv("LIBYUV_DISABLE_SSE42")) {
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
cpu_info_ &= ~kCpuHasSSE42;
}
if (getenv("LIBYUV_DISABLE_AVX")) {
if (TestEnv("LIBYUV_DISABLE_AVX")) {
cpu_info_ &= ~kCpuHasAVX;
}
if (getenv("LIBYUV_DISABLE_AVX2")) {
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
cpu_info_ &= ~kCpuHasAVX2;
}
if (getenv("LIBYUV_DISABLE_ASM")) {
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info_ = kCpuInitialized;
}
#elif defined(__arm__)
#if defined(__linux__) && defined(__ARM_NEON__)
#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
// Linux ARM: parse /proc/cpuinfo text file for Neon detection.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__)
@ -181,10 +192,10 @@ int InitCpuFlags(void) {
cpu_info_ = kCpuHasNEON;
#endif
cpu_info_ |= kCpuInitialized | kCpuHasARM;
if (getenv("LIBYUV_DISABLE_NEON")) {
if (TestEnv("LIBYUV_DISABLE_NEON")) {
cpu_info_ &= ~kCpuHasNEON;
}
if (getenv("LIBYUV_DISABLE_ASM")) {
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info_ = kCpuInitialized;
}
#endif // __arm__

View File

@ -21,7 +21,7 @@ extern "C" {
#endif
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
// and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

View File

@ -10,6 +10,7 @@
#include "libyuv/mjpeg_decoder.h"
#ifdef HAVE_JPEG
// Must be included before jpeglib
#include <assert.h>
#ifndef __CLR_VER
@ -80,7 +81,7 @@ MJpegDecoder::~MJpegDecoder() {
}
// Helper function to validate the jpeg looks ok.
// TODO(fbarchard): Improve performance. Scan backward for EOI?
bool ValidateJpeg(const uint8* sample, size_t sample_size) {
if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size
@ -105,7 +106,7 @@ bool ValidateJpeg(const uint8* sample, size_t sample_size) {
}
}
if (!total_eoi) {
// ERROR: Invalid jpeg end code not found. Size sample_size
return false;
}
return true;
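// Context for the scan above: a JPEG stream opens with an SOI marker
// (0xFF 0xD8) and closes with an EOI marker (0xFF 0xD9); total_eoi counts
// EOI markers, so zero typically means a truncated capture.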
@ -578,3 +579,5 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
}
} // namespace libyuv
#endif // HAVE_JPEG

View File

@ -105,6 +105,130 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
}
}
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
void (*YUY2ToUV422Row)(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void (*YUY2ToYRow)(const uint8* src_yuy2,
uint8* dst_y, int pix);
YUY2ToYRow = YUY2ToYRow_C;
YUY2ToUV422Row = YUY2ToUV422Row_C;
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
}
}
#elif defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width > 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
}
}
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUV422Row = YUY2ToUV422Row_NEON;
}
}
#endif
for (int y = 0; y < height; ++y) {
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
src_yuy2 += src_stride_yuy2;
dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
return 0;
}
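// Note that the 422 path pairs YUY2ToYRow with YUY2ToUV422Row, which
// subsamples chroma horizontally only; the I420 converters instead use
// YUY2ToUVRow, which additionally averages chroma across two rows.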
// Convert UYVY to I422.
LIBYUV_API
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
void (*UYVYToUV422Row)(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void (*UYVYToYRow)(const uint8* src_uyvy,
uint8* dst_y, int pix);
UYVYToYRow = UYVYToYRow_C;
UYVYToUV422Row = UYVYToUV422Row_C;
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToUV422Row = UYVYToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToYRow = UYVYToYRow_SSE2;
}
}
}
}
#elif defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width > 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
}
}
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
UYVYToUV422Row = UYVYToUV422Row_NEON;
}
}
#endif
for (int y = 0; y < height; ++y) {
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
src_uyvy += src_stride_uyvy;
dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
return 0;
}
// Mirror I420 with optional flipping
LIBYUV_API
int I420Mirror(const uint8* src_y, int src_stride_y,
@ -721,6 +845,11 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
}
#endif
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
@ -789,129 +918,6 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
return 0;
}
// SetRow8 writes 'count' bytes using a 32 bit value repeated
// SetRow32 writes 'count' words using a 32 bit value repeated
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SETROW_NEON
static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( // NOLINT
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.u32 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
: "q0", "memory", "cc");
}
// TODO(fbarchard): Make fully assembler
static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
SetRow8_NEON(dst, v32, width << 2);
dst += dst_stride;
}
}
#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SETROW_X86
__declspec(naked) __declspec(align(16))
static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
mov edi, edx
ret
}
}
__declspec(naked) __declspec(align(16))
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
push esi
push edi
push ebp
mov edi, [esp + 12 + 4] // dst
mov eax, [esp + 12 + 8] // v32
mov ebp, [esp + 12 + 12] // width
mov edx, [esp + 12 + 16] // dst_stride
mov esi, [esp + 12 + 20] // height
lea ecx, [ebp * 4]
sub edx, ecx // stride - width * 4
align 16
convertloop:
mov ecx, ebp
rep stosd
add edi, edx
sub esi, 1
jg convertloop
pop ebp
pop edi
pop esi
ret
}
}
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SETROW_X86
static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile ( // NOLINT
"shr $0x2,%1 \n"
"rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
}
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
size_t width_tmp = static_cast<size_t>(width);
uint32* d = reinterpret_cast<uint32*>(dst);
asm volatile ( // NOLINT
"rep stosl \n"
: "+D"(d), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
dst += dst_stride;
}
}
#endif
static void SetRow8_C(uint8* dst, uint32 v8, int count) {
#ifdef _MSC_VER
for (int x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
}
static void SetRows32_C(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
uint32* d = reinterpret_cast<uint32*>(dst);
for (int x = 0; x < width; ++x) {
d[x] = v32;
}
dst += dst_stride;
}
}
LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
@ -929,13 +935,6 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
SetRow = SetRow8_X86;
}
#endif
#if defined(HAS_SETROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
SetRow = SetRow8_SSE2;
}
#endif
uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
// Set plane
@ -1242,7 +1241,7 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
}
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
@ -1270,7 +1269,7 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
// Blur ARGB image.
// Caller should allocate CumulativeSum table of width * height * 16 bytes
// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
// as the buffer is treated as circular.
LIBYUV_API
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
@ -1290,7 +1289,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
}
#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
ARGBComputeCumulativeSum(src_argb, src_stride_argb,
dst_cumsum, dst_stride32_cumsum,

View File

@ -814,7 +814,7 @@ void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is a transpose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;
@ -826,7 +826,7 @@ void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is a transpose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;
@ -880,7 +880,7 @@ void RotatePlane180(const uint8* src, int src_stride,
if (width > kMaxStride) {
return;
}
// Swap first and last row and mirror the content. Uses a temporary row.
SIMD_ALIGNED(uint8 row[kMaxStride]);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);

View File

@ -58,7 +58,7 @@ void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;
@ -69,7 +69,7 @@ void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;
@ -109,7 +109,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
if (width * 4 > kMaxStride) {
return;
}
// Swap first and last row and mirror the content. Uses a temporary row.
SIMD_ALIGNED(uint8 row[kMaxStride]);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);

View File

@ -26,12 +26,12 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"
@ -81,7 +81,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"subs %4, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %4, #8 \n"
"beq 4f \n"
@ -193,12 +193,12 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_b, int dst_stride_b,
int width) {
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"
@ -264,7 +264,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"subs %6, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %6, #8 \n"
"beq 4f \n"

View File

@ -330,7 +330,7 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
int sb = (b * 17 + g * 68 + r * 35) >> 7;
int sg = (b * 22 + g * 88 + r * 45) >> 7;
int sr = (b * 24 + g * 98 + r * 50) >> 7;
// b does not overflow. a is preserved from original.
if (sg > 255) {
sg = 255;
}
@ -344,7 +344,7 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
}
}
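// The integer weights in ARGBSepiaRow_C above are the classic sepia matrix
// in /128 fixed point: e.g. sr = (24 * b + 98 * g + 50 * r) >> 7 is
// approximately 0.19 * B + 0.77 * G + 0.39 * R.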
// Apply color matrix to a row of image. Matrix is signed.
void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
for (int x = 0; x < width; ++x) {
int b = dst_argb[0];
@ -459,6 +459,14 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
(255u << ashift);
}
static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
int32 y1 = (static_cast<int32>(y) - 16) * YG;
*b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
*g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
*r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
}
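// In real-number form the 6-bit fixed-point math above approximates the
// usual BT.601 conversion (the YG and UB..BR constants encode these
// coefficients, scaled by 64):
//   B = 1.164 * (Y - 16) + 2.018 * (U - 128)
//   G = 1.164 * (Y - 16) - 0.391 * (U - 128) - 0.813 * (V - 128)
//   R = 1.164 * (Y - 16) + 1.596 * (V - 128)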
void I444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@ -492,6 +500,48 @@ void I422ToARGBRow_C(const uint8* y_buf,
}
}
void I422ToRGB24Row_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 6; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
}
}
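// RGB24 above stores bytes in B,G,R order; I422ToRAWRow_C below performs the
// same conversion with the first and third pointers swapped to emit R,G,B.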
void I422ToRAWRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 6; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
}
}
void I411ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@ -671,6 +721,28 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
void SetRow8_C(uint8* dst, uint32 v8, int count) {
#ifdef _MSC_VER
// VC will generate rep stosb.
for (int x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
}
void SetRows32_C(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
uint32* d = reinterpret_cast<uint32*>(dst);
for (int x = 0; x < width; ++x) {
d[x] = v32;
}
dst += dst_stride;
}
}
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
@ -950,6 +1022,11 @@ Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
#endif
#ifdef HAS_I422TORGB24ROW_SSSE3
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \
I422ToRGB24Row_C, 1)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
#endif
#ifdef HAS_I422TORGBAROW_SSSE3
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
#endif
@ -958,6 +1035,10 @@ YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0)
Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0)
YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1)
YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1)
#endif
#undef YANY

View File

@ -101,8 +101,8 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TOARGBROW_NEON
@ -135,8 +135,8 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TOBGRAROW_NEON
@ -169,8 +169,8 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TOABGRROW_NEON
@ -202,12 +202,77 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TORGBAROW_NEON
#ifdef HAS_I422TORGB24ROW_NEON
void I422ToRGB24Row_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
"subs %4, %4, #8 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TORGB24ROW_NEON
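// The vst3.8 above writes d20/d21/d22 interleaved, emitting three bytes per
// pixel; the RAW variant below is identical except for a vswp of d20 and d22
// to exchange B and R before the store.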
#ifdef HAS_I422TORAWROW_NEON
void I422ToRAWRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TORAWROW_NEON
#ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
@ -233,8 +298,8 @@ void NV12ToARGBRow_NEON(const uint8* y_buf,
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_NV12TOARGBROW_NEON
@ -264,8 +329,8 @@ void NV21ToARGBRow_NEON(const uint8* y_buf,
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_NV21TOARGBROW_NEON
@ -312,7 +377,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_NEON
#ifdef HAS_SETROW_NEON
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( // NOLINT
"vdup.u32 q0, %2 \n" // duplicate 4 ints
@ -327,7 +392,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
}
// TODO(fbarchard): Make fully assembler
// SetRow32 writes 'count' words using a 32 bit value repeated.
void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
@ -344,11 +409,11 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %1, %2 \n"
// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"
// the output is written in two blocks. 8 bytes followed
// by another 8. reading is done sequentially, from left to
// right. writing is done from right to left in block sizes
// %1, the destination pointer is incremented after writing
// the first of the two blocks. need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"
"beq 2f \n"
@ -356,9 +421,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// back of destination by the size of the register that is
// going to be mirrored
"sub %1, #16 \n"
// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"
@ -375,7 +440,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16
"bge 1b \n"
// add 16 back to the counter. if the result is 0 there is no
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"
@ -430,9 +495,9 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
// going to be mirrored
"sub %1, #8 \n"
"sub %2, #8 \n"
// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"
@ -446,7 +511,7 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
"bge 1b \n"
// add 8 back to the counter. if the result is 0 there is no
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"

View File

@ -741,9 +741,9 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if stack frame is disabled. Doing 2 assembly blocks is a workaround
// and considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
@ -2143,6 +2143,34 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
}
#endif // HAS_COPYROW_X86
#ifdef HAS_SETROW_X86
void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile (
"shr $0x2,%1 \n"
"rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
}
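// Note on the asm above: 'rep stosl' stores the 32-bit value in eax into the
// destination, ecx times, advancing the pointer each time; the shr by 2
// converts the byte count to a dword count, so callers are expected to pass
// a multiple of 4.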
void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
size_t width_tmp = static_cast<size_t>(width);
uint32* d = reinterpret_cast<uint32*>(dst);
asm volatile (
"rep stosl \n"
: "+D"(d), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
dst += dst_stride;
}
}
#endif // HAS_SETROW_X86
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
@ -2998,7 +3026,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#endif // HAS_ARGBUNATTENUATEROW_SSE2
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
CONST vec8 kARGBToGray = {
14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};
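A quick sanity check on these weights: they are 7 bit fixed point and sum to 128, so the per-pixel computation closely matches the stated ratios.

// 14 + 76 + 38 == 128, so per pixel:
//   gray = (14 * B + 76 * G + 38 * R) >> 7
// with 14/128 = 0.109, 76/128 = 0.594, 38/128 = 0.297, i.e. roughly the
// 0.11 / 0.59 / 0.30 weights named above.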
@ -3455,7 +3483,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used. movd %%xmm0,%1
// an error if movq is used. movd %%xmm0,%1
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,

View File

@ -18,6 +18,7 @@ extern "C" {
// This module is for Visual C x86.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3
// Constants for ARGB.
@ -2521,6 +2522,54 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_X86
#ifdef HAS_SETROW_X86
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
mov edi, edx
ret
}
}
// SetRows32 writes 'width' words per row using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
push esi
push edi
push ebp
mov edi, [esp + 12 + 4] // dst
mov eax, [esp + 12 + 8] // v32
mov ebp, [esp + 12 + 12] // width
mov edx, [esp + 12 + 16] // dst_stride
mov esi, [esp + 12 + 20] // height
lea ecx, [ebp * 4]
sub edx, ecx // stride - width * 4
align 16
convertloop:
mov ecx, ebp
rep stosd
add edi, edx
sub esi, 1
jg convertloop
pop ebp
pop edi
pop esi
ret
}
}
#endif // HAS_SETROW_X86
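Note how the row stride is handled in SetRows32_X86 above: `lea ecx, [ebp * 4]` / `sub edx, ecx` leaves dst_stride - width * 4 in edx, and since `rep stosd` has already advanced edi by width * 4 bytes, `add edi, edx` lands exactly at the start of the next row. For example, with width 640 and dst_stride 2560 the adjustment is 0 and the rows are written contiguously.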
#ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
@ -3497,7 +3546,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16))
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,

View File

@ -54,514 +54,49 @@ void SetUseReferenceImpl(bool use) {
#define HAS_SCALEROWDOWN2_NEON
// Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
uint8* dst, int dst_width);
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
// row 2 add adjacent, add row 1 to row 2
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
uint8* dst, int dst_width);
#define HAS_SCALEROWDOWN4_NEON
static void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}
static void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#define HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
static void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
// 32 -> 12
static void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
// 32x3 -> 12x1
static void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x2 -> 12x1
static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 16x2 -> 16x1
#define HAS_SCALEFILTERROWS_NEON
static void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
/**
* SSE2 downscalers with interpolation.
@ -1010,7 +545,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Note that movdqa+palign may be better than movdqu.
@ -1049,7 +584,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Register usage:
@ -3420,7 +2955,7 @@ int I420Scale(const uint8* src_y, int src_stride_y,
dst_halfwidth = dst_width >> 1;
}
// If caller used height / 2 when computing src_v, it will point into what
// should be the src_u plane. Detect this and reduce halfheight to match.
// should be the src_u plane. Detect this and reduce halfheight to match.
int uv_src_plane_size = src_halfwidth * src_halfheight;
if ((src_height & 1) &&
(src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
@ -3484,7 +3019,7 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
dst_halfwidth = dst_width >> 1;
}
// If caller used height / 2 when computing src_v, it will point into what
// should be the src_u plane. Detect this and reduce halfheight to match.
// should be the src_u plane. Detect this and reduce halfheight to match.
int uv_src_plane_size = src_halfwidth * src_halfheight;
if ((src_height & 1) &&
(src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {

source/scale_neon.cc Normal file (534 lines)
View File

@ -0,0 +1,534 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC Neon.
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
/**
* NEON downscalers with interpolation.
*
* Provided by Fritz Koenig
*
*/
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
"vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
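ScaleRowDown2 point-samples the even pixels, while the Int variant is a 2x2 box filter: vpaddl/vpadal sum each 2x2 block and vrshrn divides by 4 with rounding. In plain C, what the filtered path computes per output pixel (a sketch matching the comments above, not the shipped C path):

// 2x2 box filter with rounding; each dst pixel averages a 2x2 source block.
static void ScaleRowDown2Int_C_sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // +2 rounds the shift
    s += 2;
    t += 2;
  }
}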
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
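Likewise, ScaleRowDown4Int is a 4x4 box filter: two levels of pairwise adds accumulate 16 source pixels and vrshrn #4 divides by 16 with rounding. A C sketch of the per-pixel math:

// 4x4 box filter with rounding; each dst pixel averages a 4x4 source block.
static void ScaleRowDown4Int_C_sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int dy = 0; dy < 4; ++dy) {
      for (int dx = 0; dx < 4; ++dx) {
        sum += src_ptr[dy * src_stride + dx];
      }
    }
    dst_ptr[x] = (sum + 8) >> 4;  // divide by 16 with rounding
    src_ptr += 4;
  }
}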
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
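The point-sampled 4-to-3 path works by lane selection: vld4 deinterleaves every 4th pixel into d0..d3 and the vmov replaces lane 2 with lane 3, so the stored triple is source pixels 0, 1 and 3 of each group of four. Equivalent C (sketch only):

// Point sample 4 -> 3: keep pixels 0, 1 and 3 of every group of 4.
static void ScaleRowDown34_C_sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];
    dst_ptr += 3;
    src_ptr += 4;
  }
}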
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
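After the two source rows are merged (3:1 weighted in the _0 variant, a plain vrhadd average in the _1 variant), both filtered 3/4 paths reduce to the same three taps per group of four combined pixels, as the a0/a1/a2 comments state:

// Per group of four combined pixels s[0..3] (sketch of the tap math):
//   a0 = (3 * s[0] + s[1] + 2) >> 2
//   a1 = (s[1] + s[2] + 1) >> 1
//   a2 = (s[2] + 3 * s[3] + 2) >> 2
// The +2 / +1 terms are the rounding applied by vqrshrn / vrhadd.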
#define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
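These constants look odd at first glance: dividing by 6 uses 65536 / 12 and dividing by 9 uses 65536 / 18. The reason is that vqrdmulh.s16 returns roughly (a * b * 2) >> 16, i.e. it doubles the product before taking the high half, so the reciprocal is stored at half scale:

//   a * (65536 / 12) * 2 / 65536  ~=  a / 6   (65536 / 12 == 5461)
//   a * (65536 / 18) * 2 / 65536  ~=  a / 9   (65536 / 18 == 3640)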
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
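Geometrically, each group of 8 source columns yields 3 outputs here, matching the 3, 3, 2 stride pattern visible in kShuf38: the Div9 constant averages the two 3-wide column groups over the three rows (9 pixels each), and the Div6 constant averages the trailing 2-wide group (6 pixels), per the dst_ptr[3] comment. A sketch of one group, assuming rows r0, r1, r2:

//   dst[0] = (r0[0..2] + r1[0..2] + r2[0..2]) / 9
//   dst[1] = (r0[3..5] + r1[3..5] + r2[3..5]) / 9
//   dst[2] = (r0[6..7] + r1[6..7] + r2[6..7]) / 6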
// 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
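ScaleFilterRows blends two source rows with an 8 bit vertical fraction, with fast paths for fraction 0 (copy row 0) and 128 (vrhadd average); the final vst1.u8 {d1[7]} duplicates the last output byte one slot past the row, presumably so a following horizontal pass can safely read one byte past the end. The general path in C (a sketch of the vmull/vmlal/vrshrn sequence, not the shipped code):

// Blend two rows with weights (256 - f, f) and round.
static void ScaleFilterRows_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
  int y1 = source_y_fraction;   // weight of row 1
  int y0 = 256 - y1;            // weight of row 0 (the rsb %4, #256)
  const uint8* s0 = src_ptr;
  const uint8* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (s0[x] * y0 + s1[x] * y1 + 128) >> 8;  // +128 rounds
  }
}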
#endif // __ARM_NEON__
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@ -76,8 +76,8 @@ static int ARGBTestRotate(int src_width, int src_height,
printf("filter %d - %8d us C - %8d us OPT\n",
mode, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int max_diff = 0;

View File

@ -80,8 +80,8 @@ static int ARGBTestFilter(int src_width, int src_height,
printf("filter %d - %8d us C - %8d us OPT\n",
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int max_diff = 0;

View File

@ -118,8 +118,8 @@ static int TestFilter(int src_width, int src_height,
printf("filter %d - %8d us C - %8d us OPT\n",
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int max_diff = 0;