clang-format libyuv

BUG=libyuv:654
R=kjellander@chromium.org

Review URL: https://codereview.chromium.org/2469353005 .
Frank Barchard 2016-11-07 17:37:23 -08:00
parent f2c27dafa2
commit e62309f259
79 changed files with 16395 additions and 13666 deletions

.clang-format (new file)

@@ -0,0 +1,6 @@
# Defines the Chromium style for automatic reformatting.
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
BasedOnStyle: Chromium
---
Language: Java
BasedOnStyle: Google
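
For illustration only (not part of the file above): the practical effect of BasedOnStyle: Chromium here is that long declarations are rewrapped against the 80-column limit and token-pasting operators lose their surrounding spaces, which accounts for nearly all of the churn below. A minimal before/after sketch, taken from the compare.h hunk further down:

// Before clang-format:
uint64 ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b, int count);
// After clang-format (Chromium style):
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, int count);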


@@ -26,31 +26,31 @@
typedef unsigned __int64 uint64;
typedef __int64 int64;
#ifndef INT64_C
#define INT64_C(x) x ## I64
#define INT64_C(x) x##I64
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UI64
#define UINT64_C(x) x##UI64
#endif
#define INT64_F "I64"
#else // COMPILER_MSVC
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64; // NOLINT
typedef long int64; // NOLINT
typedef long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## L
#define INT64_C(x) x##L
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UL
#define UINT64_C(x) x##UL
#endif
#define INT64_F "l"
#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64; // NOLINT
typedef long long int64; // NOLINT
typedef long long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## LL
#define INT64_C(x) x##LL
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## ULL
#define UINT64_C(x) x##ULL
#endif
#define INT64_F "ll"
#endif // __LP64__
@@ -58,15 +58,15 @@ typedef long long int64;  // NOLINT
typedef unsigned int uint32;
typedef int int32;
typedef unsigned short uint16; // NOLINT
typedef short int16; // NOLINT
typedef short int16; // NOLINT
typedef unsigned char uint8;
typedef signed char int8;
#endif // INT_TYPES_DEFINED
#endif // GG_LONGLONG
// Detect compiler is for x86 or x64.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86)
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
defined(_M_IX86)
#define CPU_X86 1
#endif
// Detect compiler is for ARM.
@@ -76,12 +76,12 @@ typedef signed char int8;
#ifndef ALIGNP
#ifdef __cplusplus
#define ALIGNP(p, t) \
(reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
((t) - 1)) & ~((t) - 1))))
#define ALIGNP(p, t) \
reinterpret_cast<uint8*>( \
((reinterpret_cast<uintptr_t>(p) + ((t)-1)) & ~((t)-1)))
#else
#define ALIGNP(p, t) \
((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */
(uint8*)((((uintptr_t)(p) + ((t)-1)) & ~((t)-1))) /* NOLINT */
#endif
#endif
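
A worked example of the ALIGNP macro above (the pointer value is chosen purely for illustration): with t = 16, the expression (p + 15) & ~15 rounds p up to the next 16-byte boundary.

// p = 0x1004: (0x1004 + 15) & ~15 = 0x1013 & ~0xF = 0x1010.
uint8* aligned = ALIGNP(buffer, 16);  // 'buffer' is a hypothetical uint8*.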
@@ -95,9 +95,9 @@ typedef signed char int8;
#define LIBYUV_API
#endif // LIBYUV_BUILDING_SHARED_LIBRARY
#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
(defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
defined(LIBYUV_USING_SHARED_LIBRARY))
#define LIBYUV_API __attribute__ ((visibility ("default")))
(defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
defined(LIBYUV_USING_SHARED_LIBRARY))
#define LIBYUV_API __attribute__((visibility("default")))
#else
#define LIBYUV_API
#endif // __GNUC__
@@ -108,10 +108,9 @@ typedef signed char int8;
#define LIBYUV_TRUE 1
// Visual C x86 or GCC little endian.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86) || \
defined(__arm__) || defined(_M_ARM) || \
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define LIBYUV_LITTLE_ENDIAN
#endif


@@ -29,13 +29,15 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
// Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a,
const uint8* src_b, int count);
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, int count);
LIBYUV_API
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height);
uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height);
static const int kMaxPsnr = 128;
@@ -43,32 +45,52 @@ LIBYUV_API
double SumSquareErrorToPsnr(uint64 sse, uint64 count);
LIBYUV_API
double CalcFramePsnr(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height);
double CalcFramePsnr(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height);
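
A usage sketch for the PSNR helpers (buffer names are hypothetical): CalcFramePsnr is equivalent to computing the plane SSE and converting it, where SumSquareErrorToPsnr evaluates 10 * log10(255^2 * count / sse) and returns kMaxPsnr when sse is 0.

uint64 sse = ComputeSumSquareErrorPlane(plane_a, stride_a, plane_b, stride_b,
                                        width, height);
double psnr = SumSquareErrorToPsnr(sse, (uint64)width * height);
// One-call equivalent:
double psnr2 = CalcFramePsnr(plane_a, stride_a, plane_b, stride_b,
                             width, height);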
LIBYUV_API
double I420Psnr(const uint8* src_y_a, int stride_y_a,
const uint8* src_u_a, int stride_u_a,
const uint8* src_v_a, int stride_v_a,
const uint8* src_y_b, int stride_y_b,
const uint8* src_u_b, int stride_u_b,
const uint8* src_v_b, int stride_v_b,
int width, int height);
double I420Psnr(const uint8* src_y_a,
int stride_y_a,
const uint8* src_u_a,
int stride_u_a,
const uint8* src_v_a,
int stride_v_a,
const uint8* src_y_b,
int stride_y_b,
const uint8* src_u_b,
int stride_u_b,
const uint8* src_v_b,
int stride_v_b,
int width,
int height);
LIBYUV_API
double CalcFrameSsim(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height);
double CalcFrameSsim(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height);
LIBYUV_API
double I420Ssim(const uint8* src_y_a, int stride_y_a,
const uint8* src_u_a, int stride_u_a,
const uint8* src_v_a, int stride_v_a,
const uint8* src_y_b, int stride_y_b,
const uint8* src_u_b, int stride_u_b,
const uint8* src_v_b, int stride_v_b,
int width, int height);
double I420Ssim(const uint8* src_y_a,
int stride_y_a,
const uint8* src_u_a,
int stride_u_a,
const uint8* src_v_a,
int stride_v_a,
const uint8* src_y_b,
int stride_y_b,
const uint8* src_u_b,
int stride_u_b,
const uint8* src_v_b,
int stride_v_b,
int width,
int height);
#ifdef __cplusplus
} // extern "C"


@@ -30,8 +30,8 @@ extern "C" {
#endif
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
_MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
@@ -42,8 +42,8 @@ extern "C" {
#endif // clang >= 3.4
#endif // __clang__
#if !defined(LIBYUV_DISABLE_X86) && \
defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
#endif


@@ -16,8 +16,8 @@
#include "libyuv/rotate.h" // For enum RotationMode.
// TODO(fbarchard): fix WebRTC source to include following libyuv headers:
#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618
#ifdef __cplusplus
@@ -27,185 +27,295 @@ extern "C" {
// Convert I444 to I420.
LIBYUV_API
int I444ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I444ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I422 to I420.
LIBYUV_API
int I422ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I422ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Copy I420 to I420.
#define I420ToI420 I420Copy
LIBYUV_API
int I420Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420Copy(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I400 (grey) to I420.
LIBYUV_API
int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I400ToI420(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
#define J400ToJ420 I400ToI420
// Convert NV12 to I420.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int NV12ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert NV21 to I420.
LIBYUV_API
int NV21ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int NV21ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_vu,
int src_stride_vu,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int YUY2ToI420(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert UYVY to I420.
LIBYUV_API
int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int UYVYToI420(const uint8* src_uyvy,
int src_stride_uyvy,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert M420 to I420.
LIBYUV_API
int M420ToI420(const uint8* src_m420, int src_stride_m420,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int M420ToI420(const uint8* src_m420,
int src_stride_m420,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert Android420 to I420.
LIBYUV_API
int Android420ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
int Android420ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
int pixel_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// ARGB little endian (bgra in memory) to I420.
LIBYUV_API
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// BGRA little endian (argb in memory) to I420.
LIBYUV_API
int BGRAToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int BGRAToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// ABGR little endian (rgba in memory) to I420.
LIBYUV_API
int ABGRToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ABGRToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGBA little endian (abgr in memory) to I420.
LIBYUV_API
int RGBAToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int RGBAToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB little endian (bgr in memory) to I420.
LIBYUV_API
int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int RGB24ToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB big endian (rgb in memory) to I420.
LIBYUV_API
int RAWToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int RAWToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB16 (RGBP fourcc) little endian to I420.
LIBYUV_API
int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int RGB565ToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB15 (RGBO fourcc) little endian to I420.
LIBYUV_API
int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGB1555ToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB12 (R444 fourcc) little endian to I420.
LIBYUV_API
int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGB4444ToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
#ifdef HAVE_JPEG
// src_width/height provided by capture.
// dst_width/height for clipping determine final size.
LIBYUV_API
int MJPGToI420(const uint8* sample, size_t sample_size,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int src_width, int src_height,
int dst_width, int dst_height);
int MJPGToI420(const uint8* sample,
size_t sample_size,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int src_width,
int src_height,
int dst_width,
int dst_height);
// Query size of MJPG in pixels.
LIBYUV_API
int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height);
int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height);
#endif
// Convert camera sample to I420 with cropping, rotation and vertical flip.
@@ -231,13 +341,20 @@ int MJPGSize(const uint8* sample, size_t sample_size,
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToI420(const uint8* src_frame, size_t src_size,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
int ConvertToI420(const uint8* src_frame,
size_t src_size,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int crop_x,
int crop_y,
int src_width,
int src_height,
int crop_width,
int crop_height,
enum RotationMode rotation,
uint32 format);
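
A usage sketch for ConvertToI420 (buffer names are hypothetical; kRotate0 comes from rotate.h, and the FOURCC value is assumed to be built with the helper from video_common.h): converting a full YUY2 camera frame with no cropping or rotation.

int r = ConvertToI420(sample, sample_size,
                      dst_y, width,            // dst_stride_y
                      dst_u, (width + 1) / 2,  // dst_stride_u
                      dst_v, (width + 1) / 2,  // dst_stride_v
                      0, 0,                    // crop_x, crop_y
                      width, height,           // src_width, src_height
                      width, height,           // crop_width, crop_height
                      kRotate0, FOURCC('Y', 'U', 'Y', '2'));
// r == 0 on success, -1 for an invalid parameter.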


@@ -30,246 +30,384 @@ extern "C" {
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopy(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Duplicate prototype for function in convert_from.h for remoting.
LIBYUV_API
int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I422 to ARGB.
LIBYUV_API
int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I422ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I444 to ARGB.
LIBYUV_API
int I444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I444ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J444 to ARGB.
LIBYUV_API
int J444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J444ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I444 to ABGR.
LIBYUV_API
int I444ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int I444ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert I420 with Alpha to preattenuated ARGB.
LIBYUV_API
int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int attenuate);
int I420AlphaToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
const uint8* src_a,
int src_stride_a,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height,
int attenuate);
// Convert I420 with Alpha to preattenuated ABGR.
LIBYUV_API
int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height, int attenuate);
int I420AlphaToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
const uint8* src_a,
int src_stride_a,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height,
int attenuate);
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I400ToARGB(const uint8* src_y,
int src_stride_y,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J400 (jpeg grey) to ARGB.
LIBYUV_API
int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J400ToARGB(const uint8* src_y,
int src_stride_y,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Alias.
#define YToARGB I400ToARGB
// Convert NV12 to ARGB.
LIBYUV_API
int NV12ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int NV12ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert NV21 to ARGB.
LIBYUV_API
int NV21ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int NV21ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_vu,
int src_stride_vu,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert M420 to ARGB.
LIBYUV_API
int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int M420ToARGB(const uint8* src_m420,
int src_stride_m420,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int YUY2ToARGB(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert UYVY to ARGB.
LIBYUV_API
int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int UYVYToARGB(const uint8* src_uyvy,
int src_stride_uyvy,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J420ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J422ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J420 to ABGR.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int J420ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert J422 to ABGR.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int J422ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert H420 to ARGB.
LIBYUV_API
int H420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int H420ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert H422 to ARGB.
LIBYUV_API
int H422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int H422ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert H420 to ABGR.
LIBYUV_API
int H420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int H420ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert H422 to ABGR.
LIBYUV_API
int H422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int H422ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int BGRAToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// ABGR little endian (rgba in memory) to ARGB.
LIBYUV_API
int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ABGRToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGBA little endian (abgr in memory) to ARGB.
LIBYUV_API
int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int RGBAToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Deprecated function name.
#define BG24ToARGB RGB24ToARGB
// RGB little endian (bgr in memory) to ARGB.
LIBYUV_API
int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int RGB24ToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB big endian (rgb in memory) to ARGB.
LIBYUV_API
int RAWToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int RAWToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB16 (RGBP fourcc) little endian to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int RGB565ToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB15 (RGBO fourcc) little endian to ARGB.
LIBYUV_API
int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGB1555ToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB12 (R444 fourcc) little endian to ARGB.
LIBYUV_API
int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGB4444ToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
#ifdef HAVE_JPEG
// src_width/height provided by capture
// dst_width/height for clipping determine final size.
LIBYUV_API
int MJPGToARGB(const uint8* sample, size_t sample_size,
uint8* dst_argb, int dst_stride_argb,
int src_width, int src_height,
int dst_width, int dst_height);
int MJPGToARGB(const uint8* sample,
size_t sample_size,
uint8* dst_argb,
int dst_stride_argb,
int src_width,
int src_height,
int dst_width,
int dst_height);
#endif
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
@@ -295,11 +433,16 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToARGB(const uint8* src_frame, size_t src_size,
uint8* dst_argb, int dst_stride_argb,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
int ConvertToARGB(const uint8* src_frame,
size_t src_size,
uint8* dst_argb,
int dst_stride_argb,
int crop_x,
int crop_y,
int src_width,
int src_height,
int crop_width,
int crop_height,
enum RotationMode rotation,
uint32 format);


@@ -24,142 +24,237 @@ extern "C" {
// I420Copy in convert to I420ToI420.
LIBYUV_API
int I420ToI422(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420ToI422(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
LIBYUV_API
int I420ToI444(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420ToI444(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
LIBYUV_API
int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
int I400Copy(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
int I420ToNV12(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
LIBYUV_API
int I420ToNV21(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_vu, int dst_stride_vu,
int width, int height);
int I420ToNV21(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_vu,
int dst_stride_vu,
int width,
int height);
LIBYUV_API
int I420ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToYUY2(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToUYVY(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
LIBYUV_API
int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToBGRA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
LIBYUV_API
int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
LIBYUV_API
int I420ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
int I420ToRGBA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgba,
int dst_stride_rgba,
int width,
int height);
LIBYUV_API
int I420ToRGB24(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToRGB24(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToRAW(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToRAW(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToRGB565(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
const uint8* dither4x4, int width, int height);
int I420ToRGB565Dither(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
const uint8* dither4x4,
int width,
int height);
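
A sketch of the dither argument (matrix values are illustrative, within the recommended 0..7 range; the first byte is the upper-left entry):

static const uint8 kDither4x4[16] = {
    0, 4, 1, 5,  // upper-left row
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};
I420ToRGB565Dither(src_y, stride_y, src_u, stride_u, src_v, stride_v,
                   dst_rgb565, width * 2,  // 2 bytes per RGB565 pixel
                   kDither4x4, width, height);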
LIBYUV_API
int I420ToARGB1555(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToARGB1555(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToARGB4444(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToARGB4444(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
LIBYUV_API
int ConvertFromI420(const uint8* y, int y_stride,
const uint8* u, int u_stride,
const uint8* v, int v_stride,
uint8* dst_sample, int dst_sample_stride,
int width, int height,
int ConvertFromI420(const uint8* y,
int y_stride,
const uint8* u,
int u_stride,
const uint8* v,
int v_stride,
uint8* dst_sample,
int dst_sample_stride,
int width,
int height,
uint32 format);
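
A usage sketch (buffers hypothetical; the FOURCC helper from video_common.h is assumed): writing contiguous ARGB rows by passing 0 for dst_sample_stride, as the comment above permits.

int r = ConvertFromI420(src_y, y_stride, src_u, u_stride, src_v, v_stride,
                        dst_argb, 0,  // 0 => contiguous rows (width * 4 bytes)
                        width, height, FOURCC('A', 'R', 'G', 'B'));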
#ifdef __cplusplus


@@ -21,45 +21,66 @@ extern "C" {
// Copy ARGB to ARGB.
#define ARGBToARGB ARGBCopy
LIBYUV_API
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopy(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert ARGB To BGRA.
LIBYUV_API
int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
uint8* dst_bgra, int dst_stride_bgra,
int width, int height);
int ARGBToBGRA(const uint8* src_argb,
int src_stride_argb,
uint8* dst_bgra,
int dst_stride_bgra,
int width,
int height);
// Convert ARGB To ABGR.
LIBYUV_API
int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int ARGBToABGR(const uint8* src_argb,
int src_stride_argb,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert ARGB To RGBA.
LIBYUV_API
int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
int ARGBToRGBA(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgba,
int dst_stride_rgba,
int width,
int height);
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height);
int ARGBToRGB24(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb24,
int dst_stride_rgb24,
int width,
int height);
// Convert ARGB To RAW.
LIBYUV_API
int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb, int dst_stride_rgb,
int width, int height);
int ARGBToRAW(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb,
int dst_stride_rgb,
int width,
int height);
// Convert ARGB To RGB565.
LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
int ARGBToRGB565(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb565,
int dst_stride_rgb565,
int width,
int height);
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
@@ -67,112 +88,174 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
// const uint8(*dither)[4][4];
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither4x4, int width, int height);
int ARGBToRGB565Dither(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb565,
int dst_stride_rgb565,
const uint8* dither4x4,
int width,
int height);
// Convert ARGB To ARGB1555.
LIBYUV_API
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb1555, int dst_stride_argb1555,
int width, int height);
int ARGBToARGB1555(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb1555,
int dst_stride_argb1555,
int width,
int height);
// Convert ARGB To ARGB4444.
LIBYUV_API
int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb4444, int dst_stride_argb4444,
int width, int height);
int ARGBToARGB4444(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb4444,
int dst_stride_argb4444,
int width,
int height);
// Convert ARGB To I444.
LIBYUV_API
int ARGBToI444(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToI444(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB To I422.
LIBYUV_API
int ARGBToI422(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToI422(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB To I420. (also in convert.h)
LIBYUV_API
int ARGBToI420(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToI420(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB to J420. (JPeg full range I420).
LIBYUV_API
int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToJ420(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yj,
int dst_stride_yj,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB to J422.
LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToJ422(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yj,
int dst_stride_yj,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB to J400. (JPeg full range).
LIBYUV_API
int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
int width, int height);
int ARGBToJ400(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yj,
int dst_stride_yj,
int width,
int height);
// Convert ARGB to I400.
LIBYUV_API
int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height);
int ARGBToI400(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
LIBYUV_API
int ARGBToG(const uint8* src_argb, int src_stride_argb,
uint8* dst_g, int dst_stride_g,
int width, int height);
int ARGBToG(const uint8* src_argb,
int src_stride_argb,
uint8* dst_g,
int dst_stride_g,
int width,
int height);
// Convert ARGB To NV12.
LIBYUV_API
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
int ARGBToNV12(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert ARGB To NV21.
LIBYUV_API
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_vu, int dst_stride_vu,
int width, int height);
int ARGBToNV21(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_vu,
int dst_stride_vu,
int width,
int height);
// Convert ARGB To NV21.
LIBYUV_API
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_vu, int dst_stride_vu,
int width, int height);
int ARGBToNV21(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_vu,
int dst_stride_vu,
int width,
int height);
// Convert ARGB To YUY2.
LIBYUV_API
int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
uint8* dst_yuy2, int dst_stride_yuy2,
int width, int height);
int ARGBToYUY2(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yuy2,
int dst_stride_yuy2,
int width,
int height);
// Convert ARGB To UYVY.
LIBYUV_API
int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
uint8* dst_uyvy, int dst_stride_uyvy,
int width, int height);
int ARGBToUYVY(const uint8* src_argb,
int src_stride_argb,
uint8* dst_uyvy,
int dst_stride_uyvy,
int width,
int height);
#ifdef __cplusplus
} // extern "C"


@@ -12,88 +12,86 @@
#define INCLUDE_LIBYUV_MACROS_MSA_H_
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include <stdint.h>
#include <msa.h>
#include <stdint.h>
#if (__mips_isa_rev >= 6)
#define LW(psrc) ({ \
uint8* psrc_lw_m = (uint8*) (psrc); /* NOLINT */ \
uint32 val_m; \
asm volatile ( \
"lw %[val_m], %[psrc_lw_m] \n\t" \
: [val_m] "=r" (val_m) \
: [psrc_lw_m] "m" (*psrc_lw_m) \
); \
val_m; \
#define LW(psrc) \
({ \
uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
uint32 val_m; \
asm volatile("lw %[val_m], %[psrc_lw_m] \n\t" \
: [val_m] "=r"(val_m) \
: [psrc_lw_m] "m"(*psrc_lw_m)); \
val_m; \
})
#if (__mips == 64)
#define LD(psrc) ({ \
uint8* psrc_ld_m = (uint8*) (psrc); /* NOLINT */ \
uint64 val_m = 0; \
asm volatile ( \
"ld %[val_m], %[psrc_ld_m] \n\t" \
: [val_m] "=r" (val_m) \
: [psrc_ld_m] "m" (*psrc_ld_m) \
); \
val_m; \
})
#else // !(__mips == 64)
#define LD(psrc) ({ \
uint8* psrc_ld_m = (uint8*) (psrc); /* NOLINT */ \
uint32 val0_m, val1_m; \
uint64 val_m = 0; \
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
val_m = (uint64) (val1_m); /* NOLINT */ \
val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
val_m = (uint64) (val_m | (uint64) val0_m); /* NOLINT */ \
val_m; \
})
#endif // (__mips == 64)
#else // !(__mips_isa_rev >= 6)
#define LW(psrc) ({ \
uint8* psrc_lw_m = (uint8*) (psrc); /* NOLINT */ \
uint32 val_m; \
asm volatile ( \
"ulw %[val_m], %[psrc_lw_m] \n\t" \
: [val_m] "=r" (val_m) \
: [psrc_lw_m] "m" (*psrc_lw_m) \
); \
val_m; \
#if (__mips == 64)
#define LD(psrc) \
({ \
uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
uint64 val_m = 0; \
asm volatile("ld %[val_m], %[psrc_ld_m] \n\t" \
: [val_m] "=r"(val_m) \
: [psrc_ld_m] "m"(*psrc_ld_m)); \
val_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
({ \
uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
uint32 val0_m, val1_m; \
uint64 val_m = 0; \
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
val_m = (uint64)(val1_m); /* NOLINT */ \
val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
val_m; \
})
#endif // (__mips == 64)
#else // !(__mips_isa_rev >= 6)
#define LW(psrc) \
({ \
uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
uint32 val_m; \
asm volatile("ulw %[val_m], %[psrc_lw_m] \n\t" \
: [val_m] "=r"(val_m) \
: [psrc_lw_m] "m"(*psrc_lw_m)); \
val_m; \
})
#if (__mips == 64)
#define LD(psrc) ({ \
uint8* psrc_ld_m = (uint8*) (psrc); /* NOLINT */ \
uint64 val_m = 0; \
asm volatile ( \
"uld %[val_m], %[psrc_ld_m] \n\t" \
: [val_m] "=r" (val_m) \
: [psrc_ld_m] "m" (*psrc_ld_m) \
); \
val_m; \
})
#else // !(__mips == 64)
#define LD(psrc) ({ \
uint8* psrc_ld_m = (uint8*) (psrc); /* NOLINT */ \
uint32 val0_m, val1_m; \
uint64 val_m = 0; \
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
val_m = (uint64) (val1_m); /* NOLINT */ \
val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
val_m = (uint64) (val_m | (uint64) val0_m); /* NOLINT */ \
val_m; \
})
#endif // (__mips == 64)
#if (__mips == 64)
#define LD(psrc) \
({ \
uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
uint64 val_m = 0; \
asm volatile("uld %[val_m], %[psrc_ld_m] \n\t" \
: [val_m] "=r"(val_m) \
: [psrc_ld_m] "m"(*psrc_ld_m)); \
val_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
({ \
uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
uint32 val0_m, val1_m; \
uint64 val_m = 0; \
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
val_m = (uint64)(val1_m); /* NOLINT */ \
val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
val_m; \
})
#endif // (__mips == 64)
#endif // (__mips_isa_rev >= 6)
// TODO(fbarchard): Consider removing __VAR_ARGS versions.
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
/* Description : Load two vectors with 16 'byte' sized elements
@@ -103,16 +101,18 @@
Details : Load 16 byte elements in 'out0' from (psrc)
Load 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) { \
out0 = LD_B(RTYPE, (psrc)); \
out1 = LD_B(RTYPE, (psrc) + stride); \
}
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
{ \
out0 = LD_B(RTYPE, (psrc)); \
out1 = LD_B(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
LD_B2(RTYPE, (psrc), stride, out0, out1); \
LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
}
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
LD_B2(RTYPE, (psrc), stride, out0, out1); \
LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
/* Description : Store two vectors with stride each having 16 'byte' sized
@@ -121,16 +121,18 @@
Details : Store 16 byte elements from 'in0' to (pdst)
Store 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) { \
ST_B(RTYPE, in0, (pdst)); \
ST_B(RTYPE, in1, (pdst) + stride); \
}
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
{ \
ST_B(RTYPE, in0, (pdst)); \
ST_B(RTYPE, in1, (pdst) + stride); \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
ST_B2(RTYPE, in0, in1, (pdst), stride); \
ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
ST_B2(RTYPE, in0, in1, (pdst), stride); \
ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
@@ -141,10 +143,11 @@
Details : Byte elements from 'in0' & 'in1' are copied selectively to
'out0' as per control vector 'mask0'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \
out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \
}
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
@@ -154,12 +157,13 @@
Details : Right half of byte elements from 'in0' and 'in1' are
interleaved and written to 'out0'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \
out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
}
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
{ \
out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
#endif // INCLUDE_LIBYUV_MACROS_MSA_H_


@@ -144,12 +144,16 @@ class LIBYUV_API MJpegDecoder {
// callback function. Each call will get the data for a whole number of
// image scanlines.
// TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
int dst_width, int dst_height);
LIBYUV_BOOL DecodeToCallback(CallbackFunction fn,
void* opaque,
int dst_width,
int dst_height);
// The helper function which recognizes the jpeg sub-sampling type.
static JpegSubsamplingType JpegSubsamplingTypeHelper(
int* subsample_x, int* subsample_y, int number_of_components);
int* subsample_x,
int* subsample_y,
int number_of_components);
private:
void AllocOutputBuffers(int num_outbufs);


@@ -24,105 +24,164 @@ extern "C" {
// Copy a plane of data.
LIBYUV_API
void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
void CopyPlane(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
LIBYUV_API
void CopyPlane_16(const uint16* src_y, int src_stride_y,
uint16* dst_y, int dst_stride_y,
int width, int height);
void CopyPlane_16(const uint16* src_y,
int src_stride_y,
uint16* dst_y,
int dst_stride_y,
int width,
int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
void SetPlane(uint8* dst_y,
int dst_stride_y,
int width,
int height,
uint32 value);
// Split interleaved UV plane into separate U and V planes.
LIBYUV_API
void SplitUVPlane(const uint8* src_uv, int src_stride_uv,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
void SplitUVPlane(const uint8* src_uv,
int src_stride_uv,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
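
A sketch (hypothetical NV12 buffers): SplitUVPlane de-interleaves NV12's UV pairs into the separate U and V planes of I420; note it operates on the half-resolution chroma dimensions.

int half_w = (width + 1) / 2;
int half_h = (height + 1) / 2;
SplitUVPlane(nv12_uv, nv12_uv_stride,
             dst_u, half_w,  // dst_stride_u
             dst_v, half_w,  // dst_stride_v
             half_w, half_h);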
// Merge separate U and V planes into one interleaved UV plane.
LIBYUV_API
void MergeUVPlane(const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
void MergeUVPlane(const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
int I400ToI400(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
#define J400ToJ400 I400ToI400
// Copy I422 to I422.
#define I422ToI422 I422Copy
LIBYUV_API
int I422Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I422Copy(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Copy I444 to I444.
#define I444ToI444 I444Copy
LIBYUV_API
int I444Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I444Copy(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int YUY2ToI422(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert UYVY to I422.
LIBYUV_API
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int UYVYToI422(const uint8* src_uyvy,
int src_stride_uyvy,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
LIBYUV_API
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
int YUY2ToNV12(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
LIBYUV_API
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
int UYVYToNV12(const uint8* src_uyvy,
int src_stride_uyvy,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
LIBYUV_API
int YUY2ToY(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
int width, int height);
int YUY2ToY(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
int width, int height);
int I420ToI400(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Alias
#define J420ToJ400 I420ToI400
@@ -130,13 +189,20 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
// I420 mirror.
LIBYUV_API
int I420Mirror(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420Mirror(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Alias
#define I400ToI400Mirror I400Mirror
@@ -144,87 +210,139 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
// I400 mirror. A single plane is mirrored horizontally.
// Pass negative height to achieve 180 degree rotation.
LIBYUV_API
int I400Mirror(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
int I400Mirror(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Alias
#define ARGBToARGBMirror ARGBMirror
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBMirror(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
int NV12ToRGB565(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_rgb565,
int dst_stride_rgb565,
int width,
int height);
// I422ToARGB is in convert_argb.h
// Convert I422 to BGRA.
LIBYUV_API
int I422ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_bgra, int dst_stride_bgra,
int width, int height);
int I422ToBGRA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_bgra,
int dst_stride_bgra,
int width,
int height);
// Convert I422 to ABGR.
LIBYUV_API
int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int I422ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert I422 to RGBA.
LIBYUV_API
int I422ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
int I422ToRGBA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgba,
int dst_stride_rgba,
int width,
int height);
// Alias
#define RGB24ToRAW RAWToRGB24
LIBYUV_API
int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height);
int RAWToRGB24(const uint8* src_raw,
int src_stride_raw,
uint8* dst_rgb24,
int dst_stride_rgb24,
int width,
int height);
// Draw a rectangle into I420.
LIBYUV_API
int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int x, int y, int width, int height,
int value_y, int value_u, int value_v);
int I420Rect(uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int x,
int y,
int width,
int height,
int value_y,
int value_u,
int value_v);
// Draw a rectangle into ARGB.
LIBYUV_API
int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height, uint32 value);
int ARGBRect(uint8* dst_argb,
int dst_stride_argb,
int x,
int y,
int width,
int height,
uint32 value);
// Convert ARGB to gray scale ARGB.
LIBYUV_API
int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBGrayTo(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Make a rectangle of ARGB gray scale.
LIBYUV_API
int ARGBGray(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height);
int ARGBGray(uint8* dst_argb,
int dst_stride_argb,
int x,
int y,
int width,
int height);
// Make a rectangle of ARGB Sepia tone.
LIBYUV_API
int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height);
int ARGBSepia(uint8* dst_argb,
int dst_stride_argb,
int x,
int y,
int width,
int height);
// Apply a matrix rotation to each ARGB pixel.
// matrix_argb is 4 rows of 4 signed coefficients. -128 to 127 representing -2 to 2.
@@ -233,10 +351,13 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
// The next 4 coefficients apply to B, G, R, A and produce R of the output.
// The last 4 coefficients apply to B, G, R, A and produce A of the output.
LIBYUV_API
int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int ARGBColorMatrix(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
const int8* matrix_argb,
int width, int height);
int width,
int height);
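// Worked example (editor's sketch): with -128..127 spanning -2..2, a
// coefficient of 64 represents 1.0. A BT.601-style grayscale matrix that
// writes the same luma to B, G and R and passes alpha through:
static const int8 kGrayMatrixExample[16] = {
    7, 38, 19, 0,   // B out = 0.11 B + 0.59 G + 0.30 R (x64, approx.)
    7, 38, 19, 0,   // G out = same luma
    7, 38, 19, 0,   // R out = same luma
    0, 0,  0,  64,  // A out = A
};
// ARGBColorMatrix(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
//                 kGrayMatrixExample, width, height);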
// Deprecated. Use ARGBColorMatrix instead.
// Apply a matrix rotation to each ARGB pixel.
@@ -245,32 +366,47 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
// The next 4 coefficients apply to B, G, R, A and produce G of the output.
// The last 4 coefficients apply to B, G, R, A and produce R of the output.
LIBYUV_API
int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
int RGBColorMatrix(uint8* dst_argb,
int dst_stride_argb,
const int8* matrix_rgb,
int x, int y, int width, int height);
int x,
int y,
int width,
int height);
// Apply a color table to each ARGB pixel.
// Table contains 256 ARGB values.
LIBYUV_API
int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
int ARGBColorTable(uint8* dst_argb,
int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);
int x,
int y,
int width,
int height);
// Apply a color table to each ARGB pixel but preserve destination alpha.
// Table contains 256 ARGB values.
LIBYUV_API
int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
int RGBColorTable(uint8* dst_argb,
int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);
int x,
int y,
int width,
int height);
// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
// RGB (YJ style) and C is an 8 bit color component (R, G or B).
LIBYUV_API
int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int ARGBLumaColorTable(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
const uint8* luma_rgb_table,
int width, int height);
int width,
int height);
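// Example (editor's sketch): building an identity table with the layout
// described above - 128 luma rows of 256 component entries each, so
// entry [y * 256 + c] maps component value c at luma level y.
static void BuildIdentityLumaTable(uint8 table[128 * 256]) {
  for (int y = 0; y < 128; ++y) {
    for (int c = 0; c < 256; ++c) {
      table[y * 256 + c] = (uint8)c;  // Identity; replace with any curve.
    }
  }
}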
// Apply a 3 term polynomial to ARGB values.
// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
@@ -281,54 +417,80 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
// A polynomial approximation can be derived using software such as 'R'.
LIBYUV_API
int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int ARGBPolynomial(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
const float* poly,
int width, int height);
int width,
int height);
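// Example (editor's sketch): assuming each output channel is evaluated as
// c0 + c1*v + c2*v^2 + c3*v^3 with one column per channel (B, G, R, A),
// an identity polynomial is all zeros except 1.0 in the linear row:
static const float kIdentityPolyExample[16] = {
    0.f, 0.f, 0.f, 0.f,  // constants
    1.f, 1.f, 1.f, 1.f,  // x
    0.f, 0.f, 0.f, 0.f,  // x squared
    0.f, 0.f, 0.f, 0.f,  // x cubed
};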
// Convert plane of 16 bit shorts to half floats.
// Source values are multiplied by scale before storing as half float.
LIBYUV_API
int HalfFloatPlane(const uint16* src_y, int src_stride_y,
uint16* dst_y, int dst_stride_y,
int HalfFloatPlane(const uint16* src_y,
int src_stride_y,
uint16* dst_y,
int dst_stride_y,
float scale,
int width, int height);
int width,
int height);
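// Example (editor's sketch): normalizing a plane of 10 bit samples
// (0..1023 stored in uint16) to half floats in [0.0, 1.0]:
// HalfFloatPlane(src_y, src_stride_y, dst_y, dst_stride_y,
//                1.0f / 1023.0f, width, height);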
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
// interval_offset should be a value between 0 and 255.
LIBYUV_API
int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
int scale, int interval_size, int interval_offset,
int x, int y, int width, int height);
int ARGBQuantize(uint8* dst_argb,
int dst_stride_argb,
int scale,
int interval_size,
int interval_offset,
int x,
int y,
int width,
int height);
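// Worked example (editor's sketch): posterizing to 8 levels per channel,
// assuming the quantizer computes (v * scale >> 16) * interval_size +
// interval_offset. With interval_size = 32, scale = 65536 / 32 = 2048
// makes (v * 2048) >> 16 == v / 32, and interval_offset = 16 recenters
// each band on its midpoint:
// ARGBQuantize(dst_argb, dst_stride_argb, 2048, 32, 16,
//              0, 0, width, height);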
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopy(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Copy Alpha channel of ARGB to alpha of ARGB.
LIBYUV_API
int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopyAlpha(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Extract the alpha channel from ARGB.
LIBYUV_API
int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_a, int dst_stride_a,
int width, int height);
int ARGBExtractAlpha(const uint8* src_argb,
int src_stride_argb,
uint8* dst_a,
int dst_stride_a,
int width,
int height);
// Copy Y channel to Alpha of ARGB.
LIBYUV_API
int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopyYToAlpha(const uint8* src_y,
int src_stride_y,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
typedef void (*ARGBBlendRow)(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// Get function to Alpha Blend ARGB pixels and store to destination.
LIBYUV_API
@@ -338,92 +500,143 @@ ARGBBlendRow GetARGBBlend();
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
LIBYUV_API
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBBlend(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Alpha Blend plane and store to destination.
// Source is not pre-multiplied by alpha.
LIBYUV_API
int BlendPlane(const uint8* src_y0, int src_stride_y0,
const uint8* src_y1, int src_stride_y1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
int width, int height);
int BlendPlane(const uint8* src_y0,
int src_stride_y0,
const uint8* src_y1,
int src_stride_y1,
const uint8* alpha,
int alpha_stride,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Alpha Blend YUV images and store to destination.
// Source is not pre-multiplied by alpha.
// Alpha is full width x height and subsampled to half size to apply to UV.
LIBYUV_API
int I420Blend(const uint8* src_y0, int src_stride_y0,
const uint8* src_u0, int src_stride_u0,
const uint8* src_v0, int src_stride_v0,
const uint8* src_y1, int src_stride_y1,
const uint8* src_u1, int src_stride_u1,
const uint8* src_v1, int src_stride_v1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420Blend(const uint8* src_y0,
int src_stride_y0,
const uint8* src_u0,
int src_stride_u0,
const uint8* src_v0,
int src_stride_v0,
const uint8* src_y1,
int src_stride_y1,
const uint8* src_u1,
int src_stride_u1,
const uint8* src_v1,
int src_stride_v1,
const uint8* alpha,
int alpha_stride,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBMultiply(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Add ARGB image with ARGB image. Saturates to 255.
LIBYUV_API
int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBAdd(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
LIBYUV_API
int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBSubtract(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I422 to YUY2.
LIBYUV_API
int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I422ToYUY2(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
// Convert I422 to UYVY.
LIBYUV_API
int I422ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I422ToUYVY(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
// Convert unattenuated ARGB to preattenuated ARGB.
LIBYUV_API
int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBAttenuate(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert preattenuated ARGB to unattenuated ARGB.
LIBYUV_API
int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBUnattenuate(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height);
int ARGBComputeCumulativeSum(const uint8* src_argb,
int src_stride_argb,
int32* dst_cumsum,
int dst_stride32_cumsum,
int width,
int height);
// Blur ARGB image.
// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
@@ -432,49 +645,79 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5.
// Blur is optimized for radius of 5 (11x11) or less.
LIBYUV_API
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height, int radius);
int ARGBBlur(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int32* dst_cumsum,
int dst_stride32_cumsum,
int width,
int height,
int radius);
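// Usage sketch (editor's example): the caller provides the cumulative sum
// scratch table; per the comment above it needs width * (height + 1) * 16
// bytes (four int32 per pixel), so a stride of width * 4 int32s is assumed.
// (Needs <stdlib.h> for malloc/free.)
static int BlurExample(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_argb, int dst_stride_argb,
                       int width, int height) {
  int32* cumsum = (int32*)malloc((size_t)width * (height + 1) * 16);
  int ret = ARGBBlur(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                     cumsum, width * 4, width, height, 5 /* 11x11 */);
  free(cumsum);
  return ret;
}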
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
int ARGBShade(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height,
uint32 value);
// Interpolate between two images using specified amount of interpolation
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
// and 255 means 1% src0 and 99% src1.
LIBYUV_API
int InterpolatePlane(const uint8* src0, int src_stride0,
const uint8* src1, int src_stride1,
uint8* dst, int dst_stride,
int width, int height, int interpolation);
int InterpolatePlane(const uint8* src0,
int src_stride0,
const uint8* src1,
int src_stride1,
uint8* dst,
int dst_stride,
int width,
int height,
int interpolation);
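// Worked example (editor's sketch): a 50/50 cross fade of two equal size
// planes uses interpolation = 128; 0 returns src0 unchanged and 255 is
// almost entirely src1:
// InterpolatePlane(src0, src_stride0, src1, src_stride1,
//                  dst, dst_stride, width, height, 128);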
// Interpolate between two ARGB images using specified amount of interpolation
// Internally calls InterpolatePlane with width * 4 (bpp).
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation);
int ARGBInterpolate(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height,
int interpolation);
// Interpolate between two YUV images using specified amount of interpolation
// Internally calls InterpolatePlane on each plane where the U and V planes
// are half width and half height.
LIBYUV_API
int I420Interpolate(const uint8* src0_y, int src0_stride_y,
const uint8* src0_u, int src0_stride_u,
const uint8* src0_v, int src0_stride_v,
const uint8* src1_y, int src1_stride_y,
const uint8* src1_u, int src1_stride_u,
const uint8* src1_v, int src1_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height, int interpolation);
int I420Interpolate(const uint8* src0_y,
int src0_stride_y,
const uint8* src0_u,
int src0_stride_u,
const uint8* src0_v,
int src0_stride_v,
const uint8* src1_y,
int src1_stride_y,
const uint8* src1_u,
int src1_stride_u,
const uint8* src1_v,
int src1_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height,
int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
@@ -495,36 +738,55 @@ int I420Interpolate(const uint8* src0_y, int src0_stride_y,
// Row function for copying pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
void ARGBAffineRow_C(const uint8* src_argb,
int src_argb_stride,
uint8* dst_argb,
const float* uv_dudv,
int width);
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
void ARGBAffineRow_SSE2(const uint8* src_argb,
int src_argb_stride,
uint8* dst_argb,
const float* uv_dudv,
int width);
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
// shuffler is 16 bytes and must be aligned.
LIBYUV_API
int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_argb, int dst_stride_argb,
const uint8* shuffler, int width, int height);
int ARGBShuffle(const uint8* src_bgra,
int src_stride_bgra,
uint8* dst_argb,
int dst_stride_argb,
const uint8* shuffler,
int width,
int height);
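// Example (editor's sketch): a 16 byte shuffler covers 4 pixels; each
// index selects a source byte within that group. Swapping bytes 0 and 2
// of every pixel exchanges the B and R channels (ARGB <-> ABGR in memory).
// Real code must guarantee 16 byte alignment of the table.
static const uint8 kShuffleSwapBR[16] = {2u,  1u, 0u, 3u,  6u,  5u,  4u,  7u,
                                         10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
// ARGBShuffle(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
//             kShuffleSwapBR, width, height);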
// Sobel ARGB effect with planar output.
LIBYUV_API
int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height);
int ARGBSobelToPlane(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Sobel ARGB effect.
LIBYUV_API
int ARGBSobel(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBSobel(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
LIBYUV_API
int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBSobelXY(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
#ifdef __cplusplus
} // extern "C"


@@ -20,8 +20,8 @@ extern "C" {
// Supported rotation.
typedef enum RotationMode {
kRotate0 = 0, // No rotation.
kRotate90 = 90, // Rotate 90 degrees clockwise.
kRotate0 = 0, // No rotation.
kRotate90 = 90, // Rotate 90 degrees clockwise.
kRotate180 = 180, // Rotate 180 degrees.
kRotate270 = 270, // Rotate 270 degrees clockwise.
@@ -33,81 +33,128 @@ typedef enum RotationMode {
// Rotate I420 frame.
LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int src_width, int src_height, enum RotationMode mode);
int I420Rotate(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int src_width,
int src_height,
enum RotationMode mode);
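// Usage note (editor's sketch): for kRotate90 and kRotate270 the
// destination is src_height x src_width, so its planes and strides must
// be sized for the rotated dimensions:
// I420Rotate(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
//            dst_y, src_height,            // rotated luma stride
//            dst_u, (src_height + 1) / 2,  // rotated chroma strides
//            dst_v, (src_height + 1) / 2,
//            src_width, src_height, kRotate90);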
// Rotate NV12 input and store in I420.
LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int src_width, int src_height, enum RotationMode mode);
int NV12ToI420Rotate(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int src_width,
int src_height,
enum RotationMode mode);
// Rotate a plane by 0, 90, 180, or 270.
LIBYUV_API
int RotatePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int src_width, int src_height, enum RotationMode mode);
int RotatePlane(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int src_width,
int src_height,
enum RotationMode mode);
// Rotate planes by 90, 180, 270. Deprecated.
LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height);
void RotatePlane90(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height);
void RotatePlane180(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height);
void RotatePlane270(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void RotateUV90(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
// Rotations for when U and V are interleaved.
// These functions take one input pointer and
// split the data into two buffers while
// rotating them. Deprecated.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void RotateUV180(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void RotateUV270(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
// The 90 and 270 functions are based on transposes.
// Doing a transpose with reversing the read/write
// order will result in a rotation by +- 90 degrees.
// Deprecated.
LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height);
void TransposePlane(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void TransposeUV(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
#ifdef __cplusplus
} // extern "C"


@@ -21,9 +21,13 @@ extern "C" {
// Rotate ARGB frame
LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int src_width, int src_height, enum RotationMode mode);
int ARGBRotate(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int src_width,
int src_height,
enum RotationMode mode);
#ifdef __cplusplus
} // extern "C"


@@ -36,7 +36,8 @@ extern "C" {
// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
(defined(__i386__) || \
(defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSEWX8_SSSE3
#endif
@@ -54,64 +55,129 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSEWX8_DSPR2
#define HAS_TRANSPOSEUVWX8_DSPR2
#endif // defined(__mips__)
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height);
void TransposeWxH_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_NEON(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Fast_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Fast_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_NEON(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Any_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Any_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void TransposeUVWxH_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_SSE2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_NEON(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_DSPR2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_SSE2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_Any_NEON(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_Any_DSPR2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large.


@@ -20,25 +20,33 @@ extern "C" {
// Supported filtering.
typedef enum FilterMode {
kFilterNone = 0, // Point sample; Fastest.
kFilterLinear = 1, // Filter horizontally only.
kFilterNone = 0, // Point sample; Fastest.
kFilterLinear = 1, // Filter horizontally only.
kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
kFilterBox = 3 // Highest quality.
kFilterBox = 3 // Highest quality.
} FilterModeEnum;
// Scale a YUV plane.
LIBYUV_API
void ScalePlane(const uint8* src, int src_stride,
int src_width, int src_height,
uint8* dst, int dst_stride,
int dst_width, int dst_height,
void ScalePlane(const uint8* src,
int src_stride,
int src_width,
int src_height,
uint8* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering);
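// Example (editor's sketch): halving a luma plane. For downscaling,
// kFilterBox averages every contributing source pixel and gives the best
// quality; kFilterNone is the fastest point sampler:
// ScalePlane(src, src_stride, src_width, src_height,
//            dst, dst_stride, src_width / 2, src_height / 2, kFilterBox);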
LIBYUV_API
void ScalePlane_16(const uint16* src, int src_stride,
int src_width, int src_height,
uint16* dst, int dst_stride,
int dst_width, int dst_height,
void ScalePlane_16(const uint16* src,
int src_stride,
int src_width,
int src_height,
uint16* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering);
// Scales a YUV 4:2:0 image from the src width and height to the
@@ -52,42 +60,73 @@ void ScalePlane_16(const uint16* src, int src_stride,
// Returns 0 if successful.
LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
int src_width, int src_height,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int dst_width, int dst_height,
int I420Scale(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
int src_width,
int src_height,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
enum FilterMode filtering);
LIBYUV_API
int I420Scale_16(const uint16* src_y, int src_stride_y,
const uint16* src_u, int src_stride_u,
const uint16* src_v, int src_stride_v,
int src_width, int src_height,
uint16* dst_y, int dst_stride_y,
uint16* dst_u, int dst_stride_u,
uint16* dst_v, int dst_stride_v,
int dst_width, int dst_height,
int I420Scale_16(const uint16* src_y,
int src_stride_y,
const uint16* src_u,
int src_stride_u,
const uint16* src_v,
int src_stride_v,
int src_width,
int src_height,
uint16* dst_y,
int dst_stride_y,
uint16* dst_u,
int dst_stride_u,
uint16* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
uint8* dst_y, uint8* dst_u, uint8* dst_v,
int dst_stride_y, int dst_stride_u, int dst_stride_v,
int dst_width, int dst_height,
int Scale(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
int src_stride_y,
int src_stride_u,
int src_stride_v,
int src_width,
int src_height,
uint8* dst_y,
uint8* dst_u,
uint8* dst_v,
int dst_stride_y,
int dst_stride_u,
int dst_stride_v,
int dst_width,
int dst_height,
LIBYUV_BOOL interpolate);
// Legacy API. Deprecated.
LIBYUV_API
int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
int ScaleOffset(const uint8* src_i420,
int src_width,
int src_height,
uint8* dst_i420,
int dst_width,
int dst_height,
int dst_yoffset,
LIBYUV_BOOL interpolate);
// For testing, allow disabling of specialized scalers.


@@ -20,32 +20,52 @@ extern "C" {
#endif
LIBYUV_API
int ARGBScale(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
int ARGBScale(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
enum FilterMode filtering);
// Clipped scale takes destination rectangle coordinates for clip values.
LIBYUV_API
int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
int ARGBScaleClip(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering);
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
int YUVToARGBScaleClip(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering);
#ifdef __cplusplus


@@ -45,8 +45,8 @@ extern "C" {
#endif // __clang__
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
_MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
@@ -72,8 +72,9 @@ extern "C" {
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
defined(GCC_HAS_AVX2))
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
@@ -104,22 +105,36 @@ extern "C" {
// Scale ARGB vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int y, int dy,
int bpp, enum FilterMode filtering);
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int y,
int dy,
int bpp,
enum FilterMode filtering);
void ScalePlaneVertical_16(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_argb, uint16* dst_argb,
int x, int y, int dy,
int wpp, enum FilterMode filtering);
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16* src_argb,
uint16* dst_argb,
int x,
int y,
int dy,
int wpp,
enum FilterMode filtering);
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width, int src_height,
int dst_width, int dst_height,
enum FilterMode ScaleFilterReduce(int src_width,
int src_height,
int dst_width,
int dst_height,
enum FilterMode filtering);
// Divide num by div and return as 16.16 fixed point result.
@@ -137,363 +152,647 @@ int FixedDiv1_X86(int num, int div);
#endif
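// Worked example (editor's note), assuming the declared FixedDiv(num, div)
// computes (num << 16) / div: FixedDiv(640, 480) == 87381, i.e.
// 87381 / 65536 == 1.3333, the 16.16 fixed point scale step for 640 -> 480.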
// Compute slope values for stepping.
void ScaleSlope(int src_width, int src_height,
int dst_width, int dst_height,
void ScaleSlope(int src_width,
int src_height,
int dst_width,
int dst_height,
enum FilterMode filtering,
int* x, int* y, int* dx, int* dy);
int* x,
int* y,
int* dx,
int* dy);
void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width);
void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width);
void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int, int);
void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int, int);
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width);
void ScaleRowDown2Linear_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width);
void ScaleRowDown2Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width);
void ScaleRowDown4_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown4_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width);
void ScaleRowDown4Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown4Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width);
void ScaleRowDown34_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown34_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width);
void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width);
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* d,
int dst_width);
void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width);
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* d,
int dst_width);
void ScaleCols_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx);
void ScaleCols_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x,
int dx);
void ScaleColsUp2_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int,
int);
void ScaleColsUp2_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int,
int);
void ScaleFilterCols_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx);
void ScaleFilterCols_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x,
int dx);
void ScaleFilterCols64_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx);
void ScaleFilterCols64_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x,
int dx);
void ScaleRowDown38_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown38_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width);
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
uint8* dst_ptr,
int dst_width);
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
uint16* dst_ptr,
int dst_width);
void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr,
int dst_width);
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2Box_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDownEven_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int, int);
void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
uint8* dst_argb,
int dst_width);
void ScaleARGBCols_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBCols64_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBColsUp2_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int,
int);
void ScaleARGBFilterCols_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBFilterCols64_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
uint8* dst_ptr,
int dst_width);
void ScaleRowDown38_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
uint8* dst_ptr,
int dst_width);
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4_Any_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
uint8* dst_ptr,
int dst_width);
void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
uint8* dst_ptr,
int dst_width);
void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
uint8* dst_ptr,
int dst_width);
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols_SSSE3(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx);
void ScaleColsUp2_SSE2(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx);
// ARGB Column functions
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols_SSE2(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBCols_Any_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width);
void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width);
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width);
void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width);
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width);
void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   int src_stepx,
                                   uint8* dst_argb,
                                   int dst_width);
void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
                                      ptrdiff_t src_stride,
                                      int src_stepx,
                                      uint8* dst_argb,
                                      int dst_width);
void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   int src_stepx,
                                   uint8* dst_argb,
                                   int dst_width);
void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
                                      ptrdiff_t src_stride,
                                      int src_stepx,
                                      uint8* dst_argb,
                                      int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
// Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown4_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
// Down scale from 4 to 3 pixels. Uses the NEON multilane read/write,
// which de-interleaves so that every 4th pixel lands in each of 4
// different registers. Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
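// A scalar sketch of the 4 -> 3 point sampling (illustrative only; assumes
// the same sample choice as the C reference, which keeps pixels 0, 1 and 3
// of each group of 4):
static __inline void ScaleRowDown34_Sketch(const uint8* src_ptr,
                                           uint8* dst_ptr,
                                           int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}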
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
// 32x3 -> 12x1
void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown4_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
// 32 -> 12
void ScaleRowDown38_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
// 32x3 -> 12x1
void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleFilterCols_NEON(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx);
void ScaleFilterCols_Any_NEON(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx);
void ScaleRowDown2_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown4_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown34_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width);
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width);
void ScaleRowDown38_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width);
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
#ifdef __cplusplus
} // extern "C"


@@ -28,13 +28,13 @@ extern "C" {
// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
// constants are used in a switch.
#ifdef __cplusplus
#define FOURCC(a, b, c, d) \
((static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
(static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
#else
#define FOURCC(a, b, c, d) \
(((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
#endif
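// Worked example (illustrative, not part of this header):
//   FOURCC('I', '4', '2', '0')
//     = 'I' | '4' << 8 | '2' << 16 | '0' << 24
//     = 0x49 | 0x3400 | 0x320000 | 0x30000000
//     = 0x30323449,
// whose bytes spell "I420" when the value is stored little-endian.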
// Some pages discussing FourCC codes:
@@ -69,7 +69,7 @@ enum FourCC {
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE.
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
@@ -137,7 +137,7 @@ enum FourCCBpp {
FOURCC_BPP_ABGR = 32,
FOURCC_BPP_RGBA = 32,
FOURCC_BPP_24BG = 24,
FOURCC_BPP_RAW = 24,
FOURCC_BPP_RGBP = 16,
FOURCC_BPP_RGBO = 16,
FOURCC_BPP_R444 = 16,
@@ -170,7 +170,7 @@ enum FourCCBpp {
FOURCC_BPP_CM24 = 24,
// Match any fourcc.
FOURCC_BPP_ANY = 0, // 0 means unknown.
};
// Converts fourcc aliases into canonical ones.


@@ -32,8 +32,7 @@ LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
int remainder;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
@@ -50,13 +49,13 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
src += kBlockSize;
count -= kBlockSize;
}
remainder = (int)count & ~15;
if (remainder) {
seed = HashDjb2_SSE(src, remainder, seed);
src += remainder;
count -= remainder;
}
remainder = (int)count & 15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
}
@@ -113,7 +112,8 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a,
const uint8* src_b,
int count) {
// SumSquareError returns values 0 to 65535 for each squared difference.
// Up to 65536 of those can be summed and remain within a uint32.
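// Worked check: each squared difference is at most 255^2 = 65025, and
// 65536 * 65535 = 2^32 - 65536, so a uint32 accumulator cannot overflow
// within one 65536-pixel block.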
@@ -142,7 +142,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
@@ -162,14 +162,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
LIBYUV_API
uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height) {
uint64 sse = 0;
int h;
// Coalesce rows.
if (stride_a == width && stride_b == width) {
width *= height;
height = 1;
stride_a = stride_b = 0;
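// Worked example: a 640x480 plane whose stride equals its width is one
// contiguous 307200-byte block, so it can be summed as a single long row.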
@@ -186,10 +188,10 @@ LIBYUV_API
double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
double psnr;
if (sse > 0) {
double mse = (double)count / (double)sse;
psnr = 10.0 * log10(255.0 * 255.0 * mse);
} else {
psnr = kMaxPsnr; // Limit to prevent divide by 0
}
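// Sanity check: psnr = 10 * log10(255^2 * count / sse), so sse == count
// (mean squared error of 1.0) gives 10 * log10(65025) ~= 48.13 dB, and
// halving sse adds 10 * log10(2) ~= 3.01 dB.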
if (psnr > kMaxPsnr)
@@ -199,45 +201,53 @@ double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
}
LIBYUV_API
double CalcFramePsnr(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height) {
const uint64 samples = width * height;
const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
stride_b, width, height);
return SumSquareErrorToPsnr(sse, samples);
}
LIBYUV_API
double I420Psnr(const uint8* src_y_a,
int stride_y_a,
const uint8* src_u_a,
int stride_u_a,
const uint8* src_v_a,
int stride_v_a,
const uint8* src_y_b,
int stride_y_b,
const uint8* src_u_b,
int stride_u_b,
const uint8* src_v_b,
int stride_v_b,
int width,
int height) {
const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b,
stride_y_b, width, height);
const int width_uv = (width + 1) >> 1;
const int height_uv = (height + 1) >> 1;
const uint64 sse_u = ComputeSumSquareErrorPlane(
src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
const uint64 sse_v = ComputeSumSquareErrorPlane(
src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
const uint64 samples = width * height + 2 * (width_uv * height_uv);
const uint64 sse = sse_y + sse_u + sse_v;
return SumSquareErrorToPsnr(sse, samples);
}
static const int64 cc1 = 26634;   // (64^2*(.01*255)^2)
static const int64 cc2 = 239708;  // (64^2*(.03*255)^2)
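// The window size is folded into the constants: with N = 64 samples per
// 8x8 window, cc1 = N^2 * (0.01 * 255)^2 = 4096 * 6.5025 ~= 26634 and
// cc2 = N^2 * (0.03 * 255)^2 = 4096 * 58.5225 ~= 239708, so Ssim8x8_C can
// work on N-scaled integer sums instead of means.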
static double Ssim8x8_C(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b) {
int64 sum_a = 0;
int64 sum_b = 0;
int64 sum_sq_a = 0;
@@ -270,12 +280,12 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
(2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
const int64 sum_a_sq = sum_a * sum_a;
const int64 sum_b_sq = sum_b * sum_b;
const int64 ssim_d =
(sum_a_sq + sum_b_sq + c1) *
(count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
if (ssim_d == 0.0) {
return DBL_MAX;
@@ -288,13 +298,16 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
// block boundaries to penalize blocking artifacts.
LIBYUV_API
double CalcFrameSsim(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height) {
int samples = 0;
double ssim_total = 0;
double (*Ssim8x8)(const uint8* src_a, int stride_a, const uint8* src_b,
int stride_b) = Ssim8x8_C;
// sample point start with each 4x4 location
int i;
@@ -314,22 +327,27 @@ double CalcFrameSsim(const uint8* src_a, int stride_a,
}
LIBYUV_API
double I420Ssim(const uint8* src_y_a,
int stride_y_a,
const uint8* src_u_a,
int stride_u_a,
const uint8* src_v_a,
int stride_v_a,
const uint8* src_y_b,
int stride_y_b,
const uint8* src_u_b,
int stride_u_b,
const uint8* src_v_b,
int stride_v_b,
int width,
int height) {
const double ssim_y =
CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
const int width_uv = (width + 1) >> 1;
const int height_uv = (height + 1) >> 1;
const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b,
width_uv, height_uv);
const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b,
width_uv, height_uv);
return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
}


@@ -62,30 +62,30 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
static uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
static uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
static uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
static uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
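// The tables above unroll the djb2 recurrence hash = hash * 33 + byte
// across a 16-byte block. A scalar sketch of what one vectorized iteration
// computes (illustrative only, mirroring HashDjb2_C):
static uint32 HashDjb2_Block_Sketch(const uint8* src, uint32 seed) {
  uint32 hash = seed;
  int i;
  for (i = 0; i < 16; ++i) {
    hash = hash * 33 + src[i];
  }
  // Expanded: seed * 33^16 + sum of src[i] * 33^(15 - i), i.e. exactly the
  // factors in kHash16x33 and kHashMul0..kHashMul3.
  return hash;
}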
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
@@ -148,4 +148,3 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
} // extern "C"
} // namespace libyuv
#endif


@@ -21,12 +21,12 @@ extern "C" {
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
__declspec(naked) uint32
SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
pxor xmm0, xmm0
pxor xmm5, xmm5
@@ -61,13 +61,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
__declspec(naked) uint32
SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
vpxor ymm0, ymm0, ymm0 // sum
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
@@ -101,65 +101,65 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
}
#endif // _MSC_VER >= 1700
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
__declspec(naked) uint32
HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, xmmword ptr kHash16x33
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, xmmword ptr kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld xmm3, xmm5
movdqa xmm5, xmmword ptr kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld xmm4, xmm5
movdqa xmm5, xmmword ptr kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld xmm2, xmm5
movdqa xmm5, xmmword ptr kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
paddd xmm1, xmm3
@@ -171,18 +171,18 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
ret
}
}
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked) uint32
HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
vmovd xmm0, [esp + 12] // seed
wloop:
@@ -196,7 +196,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
lea eax, [eax + 16]
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
@@ -207,7 +207,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
sub ecx, 16
jg wloop
vmovd eax, xmm0 // return hash
vzeroupper
ret
}


@@ -30,87 +30,100 @@ static __inline int Abs(int v) {
}
// I420 To any I4xx YUV format with mirroring.
static int I420ToI4xx(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int src_y_width,
int src_y_height,
int dst_uv_width,
int dst_uv_height) {
const int dst_y_width = Abs(src_y_width);
const int dst_y_height = Abs(src_y_height);
const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
dst_uv_height <= 0) {
return -1;
}
if (dst_y) {
ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
}
ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
return 0;
}
// 420 chroma is 1/2 width, 1/2 height
// 422 chroma is 1/2 width, 1x height
LIBYUV_API
int I420ToI422(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height) {
const int dst_uv_width = (Abs(width) + 1) >> 1;
const int dst_uv_height = Abs(height);
return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
dst_v, dst_stride_v, width, height, dst_uv_width,
dst_uv_height);
}
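// Worked example: for a 1279x719 source, dst_uv_width = (1279 + 1) >> 1 =
// 640 and dst_uv_height = 719, i.e. I422 chroma is half width (rounded up)
// at full height.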
// 420 chroma is 1/2 width, 1/2 height
// 444 chroma is 1x width, 1x height
LIBYUV_API
int I420ToI444(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height) {
const int dst_uv_width = Abs(width);
const int dst_uv_height = Abs(height);
return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
dst_v, dst_stride_v, width, height, dst_uv_width,
dst_uv_height);
}
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
int I400Copy(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
int width,
int height) {
if (!src_y || !dst_y || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -124,17 +137,21 @@ int I400Copy(const uint8* src_y, int src_stride_y,
}
LIBYUV_API
int I422ToYUY2(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_yuy2,
int dst_stride_yuy2,
int width,
int height) {
int y;
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_yuy2, int width) =
I422ToYUY2Row_C;
if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -144,10 +161,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
dst_stride_yuy2 = -dst_stride_yuy2;
}
// Coalesce rows.
if (src_stride_y == width && src_stride_u * 2 == width &&
src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
@@ -180,17 +195,21 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
}
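// A scalar sketch of the per-row YUY2 packing (illustrative only; the real
// row kernels are SIMD and handle the odd-width tail). YUY2 stores
// Y0 U Y1 V for each pair of pixels:
static void I422ToYUY2Row_Sketch(const uint8* src_y,
                                 const uint8* src_u,
                                 const uint8* src_v,
                                 uint8* dst_yuy2,
                                 int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_yuy2 += 4;
  }
}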
LIBYUV_API
int I420ToYUY2(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_yuy2,
int dst_stride_yuy2,
int width,
int height) {
int y;
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_yuy2, int width) =
I422ToYUY2Row_C;
if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -240,17 +259,21 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
}
LIBYUV_API
int I422ToUYVY(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_uyvy,
int dst_stride_uyvy,
int width,
int height) {
int y;
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_uyvy, int width) =
I422ToUYVYRow_C;
if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -260,10 +283,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
dst_stride_uyvy = -dst_stride_uyvy;
}
// Coalesce rows.
if (src_stride_y == width && src_stride_u * 2 == width &&
src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
@@ -304,17 +325,21 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
}
LIBYUV_API
int I420ToUYVY(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_uyvy,
int dst_stride_uyvy,
int width,
int height) {
int y;
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_uyvy, int width) =
I422ToUYVYRow_C;
if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -365,14 +390,20 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
// TODO(fbarchard): test negative height for invert.
LIBYUV_API
int I420ToNV12(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
height == 0) {
return -1;
}
int halfwidth = (width + 1) / 2;
@@ -380,44 +411,47 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
halfwidth, halfheight);
return 0;
}
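// A scalar sketch of the U/V interleave that MergeUVPlane performs
// (illustrative only): NV12 chroma is a single plane of U,V byte pairs.
static void MergeUVRow_Sketch(const uint8* src_u,
                              const uint8* src_v,
                              uint8* dst_uv,
                              int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}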
LIBYUV_API
int I420ToNV21(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_vu,
int dst_stride_vu,
int width,
int height) {
return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
width, height);
}
// Convert I420 to RGBA with matrix
static int I420ToRGBAMatrix(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgba,
int dst_stride_rgba,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToRGBARow_C;
if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -482,50 +516,58 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
// Convert I420 to RGBA.
LIBYUV_API
int I420ToRGBA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgba,
int dst_stride_rgba,
int width,
int height) {
return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_rgba, dst_stride_rgba,
&kYuvI601Constants, width, height);
}
// Convert I420 to BGRA.
LIBYUV_API
int I420ToBGRA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_bgra,
int dst_stride_bgra,
int width,
int height) {
return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
src_stride_v, // Swap U and V
src_u, src_stride_u, dst_bgra, dst_stride_bgra,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert I420 to RGB24 with matrix
static int I420ToRGB24Matrix(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgb24,
int dst_stride_rgb24,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
void (*I422ToRGB24Row)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToRGB24Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -581,50 +623,59 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
// Convert I420 to RGB24.
LIBYUV_API
int I420ToRGB24(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgb24,
int dst_stride_rgb24,
int width,
int height) {
return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_rgb24, dst_stride_rgb24,
&kYuvI601Constants, width, height);
}
// Convert I420 to RAW.
LIBYUV_API
int I420ToRAW(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_raw,
int dst_stride_raw,
int width,
int height) {
return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
src_stride_v, // Swap U and V
src_u, src_stride_u, dst_raw, dst_stride_raw,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert I420 to ARGB1555.
LIBYUV_API
int I420ToARGB1555(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb1555,
int dst_stride_argb1555,
int width,
int height) {
int y;
void (*I422ToARGB1555Row)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB1555Row_C;
if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -679,23 +730,25 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
return 0;
}
// Convert I420 to ARGB4444.
LIBYUV_API
int I420ToARGB4444(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb4444,
int dst_stride_argb4444,
int width,
int height) {
int y;
void (*I422ToARGB4444Row)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB4444Row_C;
if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -752,20 +805,22 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
// Convert I420 to RGB565.
LIBYUV_API
int I420ToRGB565(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgb565,
int dst_stride_rgb565,
int width,
int height) {
int y;
void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToRGB565Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -821,30 +876,31 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
// Ordered 4x4 dither for 888 to 565.  Values from 0 to 7.
static const uint8 kDither565_4x4[16] = {
0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};
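// Row y of the output uses the four dither values at
// dither4x4 + ((y & 3) << 2), loaded below as a single uint32, so the
// pattern repeats every 4 rows and every 4 columns.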
// Convert I420 to RGB565 with dithering.
LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgb565,
int dst_stride_rgb565,
const uint8* dither4x4,
int width,
int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) =
ARGBToRGB565DitherRow_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -926,7 +982,8 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)),
width); // NOLINT
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
@@ -941,136 +998,98 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
// Convert I420 to specified format
LIBYUV_API
int ConvertFromI420(const uint8* y,
int y_stride,
const uint8* u,
int u_stride,
const uint8* v,
int v_stride,
uint8* dst_sample,
int dst_sample_stride,
int width,
int height,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
int r = 0;
if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
return -1;
}
switch (format) {
// Single plane formats
case FOURCC_YUY2:
r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 2, width,
height);
break;
case FOURCC_UYVY:
r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 2, width,
height);
break;
case FOURCC_RGBP:
r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 2, width,
height);
break;
case FOURCC_RGBO:
r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 2,
width, height);
break;
case FOURCC_R444:
r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 2,
width, height);
break;
case FOURCC_24BG:
r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 3, width,
height);
break;
case FOURCC_RAW:
r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 3, width,
height);
break;
case FOURCC_ARGB:
r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 4, width,
height);
break;
case FOURCC_BGRA:
r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 4, width,
height);
break;
case FOURCC_ABGR:
r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 4, width,
height);
break;
case FOURCC_RGBA:
r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 4, width,
height);
break;
case FOURCC_I400:
r = I400Copy(y, y_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width, width,
height);
break;
case FOURCC_NV12: {
uint8* dst_uv = dst_sample + width * height;
r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width, dst_uv,
dst_sample_stride ? dst_sample_stride : width, width,
height);
break;
}
case FOURCC_NV21: {
uint8* dst_vu = dst_sample + width * height;
r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width, dst_vu,
dst_sample_stride ? dst_sample_stride : width, width,
height);
break;
}
// TODO(fbarchard): Add M420.
@@ -1089,13 +1108,8 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_u = dst_sample + width * height;
dst_v = dst_u + halfwidth * halfheight;
}
r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, width,
dst_u, halfwidth, dst_v, halfwidth, width, height);
break;
}
case FOURCC_I422:
@@ -1110,13 +1124,8 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_u = dst_sample + width * height;
dst_v = dst_u + halfwidth * height;
}
r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, width,
dst_u, halfwidth, dst_v, halfwidth, width, height);
break;
}
case FOURCC_I444:
@@ -1130,13 +1139,8 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_u = dst_sample + width * height;
dst_v = dst_u + width * height;
}
r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, width,
dst_u, width, dst_v, width, width, height);
break;
}
// Formats not supported - MJPG, biplanar, some rgb formats.


@@ -22,16 +22,21 @@ extern "C" {
// ARGB little endian (bgra in memory) to I444
LIBYUV_API
int ARGBToI444(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int width) = ARGBToUV444Row_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
@@ -41,20 +46,18 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 && dst_stride_y == width &&
dst_stride_u == width && dst_stride_v == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOUV444ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV444Row = ARGBToUV444Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUV444ROW_NEON)
@@ -111,19 +114,22 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
// ARGB little endian (bgra in memory) to I422
LIBYUV_API
int ARGBToI422(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -133,10 +139,8 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 && dst_stride_y == width &&
dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -206,21 +210,23 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
}
LIBYUV_API
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height) {
int ARGBToNV12(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
if (!src_argb ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@ -331,21 +337,23 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
// Same as NV12 but U and V swapped.
LIBYUV_API
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height) {
int ARGBToNV21(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
if (!src_argb ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@ -456,19 +464,22 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to YUY2.
LIBYUV_API
int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
uint8* dst_yuy2, int dst_stride_yuy2,
int width, int height) {
int ARGBToYUY2(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yuy2,
int dst_stride_yuy2,
int width,
int height) {
int y;
void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
const uint8* src_v, uint8* dst_yuy2, int width) =
I422ToYUY2Row_C;
if (!src_argb || !dst_yuy2 ||
width <= 0 || height == 0) {
if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@ -478,8 +489,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
dst_stride_yuy2 = -dst_stride_yuy2;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_yuy2 == width * 2) {
if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_yuy2 = 0;
@ -582,19 +592,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to UYVY.
LIBYUV_API
int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
uint8* dst_uyvy, int dst_stride_uyvy,
int width, int height) {
int ARGBToUYVY(const uint8* src_argb,
int src_stride_argb,
uint8* dst_uyvy,
int dst_stride_uyvy,
int width,
int height) {
int y;
void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
const uint8* src_v, uint8* dst_uyvy, int width) =
I422ToUYVYRow_C;
if (!src_argb || !dst_uyvy ||
width <= 0 || height == 0) {
if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@ -604,8 +617,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
dst_stride_uyvy = -dst_stride_uyvy;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_uyvy == width * 2) {
if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_uyvy = 0;
@ -708,9 +720,12 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to I400.
LIBYUV_API
int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height) {
int ARGBToI400(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
int width,
int height) {
int y;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
@ -723,8 +738,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_y == width) {
if (src_stride_argb == width * 4 && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_y = 0;
@ -771,26 +785,29 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
// Shuffle table for converting ARGB to RGBA.
static uvec8 kShuffleMaskARGBToRGBA = {
3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
};
static uvec8 kShuffleMaskARGBToRGBA = {3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u,
11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
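// A scalar sketch of what the shuffle mask above encodes (hypothetical
// helper, not the SSSE3 row function): within each 16-byte group of four
// pixels, output byte j is taken from input byte mask[j], so {3, 0, 1, 2,
// ...} moves each pixel's byte 3 (alpha) in front of bytes 0..2.
#include <stddef.h>
static void ShuffleBytesSketch(const unsigned char* src, unsigned char* dst,
                               size_t n, const unsigned char mask[16]) {
  for (size_t i = 0; i + 16 <= n; i += 16) {
    for (int j = 0; j < 16; ++j) {
      dst[i + j] = src[i + mask[j]];
    }
  }
}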
// Convert ARGB to RGBA.
LIBYUV_API
int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
return ARGBShuffle(src_argb, src_stride_argb,
dst_rgba, dst_stride_rgba,
(const uint8*)(&kShuffleMaskARGBToRGBA),
width, height);
int ARGBToRGBA(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgba,
int dst_stride_rgba,
int width,
int height) {
return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
(const uint8*)(&kShuffleMaskARGBToRGBA), width, height);
}
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
int ARGBToRGB24(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb24,
int dst_stride_rgb24,
int width,
int height) {
int y;
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToRGB24Row_C;
@ -803,8 +820,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_rgb24 == width * 3) {
if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
width *= height;
height = 1;
src_stride_argb = dst_stride_rgb24 = 0;
@ -836,9 +852,12 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
// Convert ARGB To RAW.
LIBYUV_API
int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
uint8* dst_raw, int dst_stride_raw,
int width, int height) {
int ARGBToRAW(const uint8* src_argb,
int src_stride_argb,
uint8* dst_raw,
int dst_stride_raw,
int width,
int height) {
int y;
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToRAWRow_C;
@ -851,8 +870,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_raw == width * 3) {
if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
width *= height;
height = 1;
src_stride_argb = dst_stride_raw = 0;
@ -884,20 +902,22 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
static const uint8 kDither565_4x4[16] = {
0, 4, 1, 5,
6, 2, 7, 3,
1, 5, 0, 4,
7, 3, 6, 2,
0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither4x4, int width, int height) {
int ARGBToRGB565Dither(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb565,
int dst_stride_rgb565,
const uint8* dither4x4,
int width,
int height) {
int y;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
const uint32 dither4, int width) =
ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
@ -935,7 +955,8 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)), width); /* NOLINT */
*(uint32*)(dither4x4 + ((y & 3) << 2)),
width); /* NOLINT */
src_argb += src_stride_argb;
dst_rgb565 += dst_stride_rgb565;
}
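// A minimal sketch of the dithering scheme above (assumed helper names):
// output row y uses row (y & 3) of the 4x4 table, loaded as one packed
// 32-bit value, and each dither entry biases the 8-bit channels before they
// are truncated to 5/6/5 bits.
#include <stdint.h>
#include <string.h>
static uint32_t DitherWordForRow(const uint8_t dither4x4[16], int y) {
  uint32_t word;
  memcpy(&word, dither4x4 + ((y & 3) << 2), 4);  // one 4-entry table row
  return word;
}
static uint16_t DitherTo565(int r, int g, int b, int d) {
  r = r + d > 255 ? 255 : r + d;  // clamp so the bias cannot overflow
  g = g + d > 255 ? 255 : g + d;
  b = b + d > 255 ? 255 : b + d;
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}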
@ -945,9 +966,12 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
// Convert ARGB To RGB565.
// TODO(fbarchard): Consider using dither function low level with zeros.
LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
int ARGBToRGB565(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb565,
int dst_stride_rgb565,
int width,
int height) {
int y;
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToRGB565Row_C;
@ -960,8 +984,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_rgb565 == width * 2) {
if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_rgb565 = 0;
@ -1001,9 +1024,12 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
// Convert ARGB To ARGB1555.
LIBYUV_API
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb1555, int dst_stride_argb1555,
int width, int height) {
int ARGBToARGB1555(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb1555,
int dst_stride_argb1555,
int width,
int height) {
int y;
void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToARGB1555Row_C;
@ -1016,8 +1042,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_argb1555 == width * 2) {
if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb1555 = 0;
@ -1057,9 +1082,12 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
// Convert ARGB To ARGB4444.
LIBYUV_API
int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb4444, int dst_stride_argb4444,
int width, int height) {
int ARGBToARGB4444(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb4444,
int dst_stride_argb4444,
int width,
int height) {
int y;
void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToARGB4444Row_C;
@ -1072,8 +1100,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_argb4444 == width * 2) {
if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb4444 = 0;
@ -1113,19 +1140,22 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to J420. (JPeg full range I420).
LIBYUV_API
int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int ARGBToJ420(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yj,
int dst_stride_yj,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
ARGBToYJRow_C;
if (!src_argb ||
!dst_yj || !dst_u || !dst_v ||
width <= 0 || height == 0) {
if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@ -1187,19 +1217,22 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to J422. (JPeg full range I422).
LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int ARGBToJ422(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yj,
int dst_stride_yj,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
ARGBToYJRow_C;
if (!src_argb ||
!dst_yj || !dst_u || !dst_v ||
width <= 0 || height == 0) {
if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@ -1209,10 +1242,8 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_yj == width &&
dst_stride_u * 2 == width &&
dst_stride_v * 2 == width) {
if (src_stride_argb == width * 4 && dst_stride_yj == width &&
dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
@ -1265,9 +1296,12 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to J400.
LIBYUV_API
int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
int width, int height) {
int ARGBToJ400(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yj,
int dst_stride_yj,
int width,
int height) {
int y;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
ARGBToYJRow_C;
@ -1280,8 +1314,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_yj == width) {
if (src_stride_argb == width * 4 && dst_stride_yj == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_yj = 0;
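// A hedged usage sketch for the converters above: NV12 pairs a full-size Y
// plane with one interleaved UV plane at half resolution in both dimensions,
// so the UV plane needs ((width + 1) / 2) * 2 bytes per row and
// (height + 1) / 2 rows. Error handling and buffer ownership are elided.
#include <vector>
#include "libyuv/convert_from_argb.h"
int ArgbFrameToNv12(const uint8* argb, int width, int height,
                    std::vector<uint8>* y, std::vector<uint8>* uv) {
  int halfheight = (height + 1) / 2;
  int uv_stride = ((width + 1) / 2) * 2;
  y->resize(width * height);
  uv->resize(uv_stride * halfheight);
  return libyuv::ARGBToNV12(argb, width * 4,  // 4 bytes per ARGB pixel
                            y->data(), width, uv->data(), uv_stride,
                            width, height);
}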

View File

@ -37,13 +37,9 @@ static void JpegCopyI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I420Copy(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@ -55,13 +51,9 @@ static void JpegI422ToI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I422ToI420(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@ -73,13 +65,9 @@ static void JpegI444ToI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I444ToI420(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@ -91,11 +79,8 @@ static void JpegI400ToI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I400ToI420(data[0], strides[0],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@ -104,8 +89,7 @@ static void JpegI400ToI420(void* opaque,
// Query size of MJPG in pixels.
LIBYUV_API
int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height) {
int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) {
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret) {
@ -121,11 +105,16 @@ int MJPGSize(const uint8* sample, size_t sample_size,
LIBYUV_API
int MJPGToI420(const uint8* sample,
size_t sample_size,
uint8* y, int y_stride,
uint8* u, int u_stride,
uint8* v, int v_stride,
int w, int h,
int dw, int dh) {
uint8* y,
int y_stride,
uint8* u,
int u_stride,
uint8* v,
int v_stride,
int w,
int h,
int dw,
int dh) {
if (sample_size == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
@ -134,17 +123,16 @@ int MJPGToI420(const uint8* sample,
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret && (mjpeg_decoder.GetWidth() != w ||
mjpeg_decoder.GetHeight() != h)) {
if (ret &&
(mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
I420Buffers bufs = {y, y_stride, u, u_stride, v, v_stride, dw, dh};
// YUV420
if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@ -153,7 +141,7 @@ int MJPGToI420(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
// YUV422
// YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@ -164,7 +152,7 @@ int MJPGToI420(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
// YUV444
// YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@ -175,7 +163,7 @@ int MJPGToI420(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
// YUV400
// YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
@ -202,15 +190,12 @@ struct ARGBBuffers {
};
static void JpegI420ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I420ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@ -220,11 +205,8 @@ static void JpegI422ToARGB(void* opaque,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I422ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@ -234,11 +216,8 @@ static void JpegI444ToARGB(void* opaque,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I444ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@ -248,9 +227,7 @@ static void JpegI400ToARGB(void* opaque,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I400ToARGB(data[0], strides[0],
dest->argb, dest->argb_stride,
dest->w, rows);
I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@ -260,9 +237,12 @@ static void JpegI400ToARGB(void* opaque,
LIBYUV_API
int MJPGToARGB(const uint8* sample,
size_t sample_size,
uint8* argb, int argb_stride,
int w, int h,
int dw, int dh) {
uint8* argb,
int argb_stride,
int w,
int h,
int dw,
int dh) {
if (sample_size == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
@ -271,17 +251,16 @@ int MJPGToARGB(const uint8* sample,
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret && (mjpeg_decoder.GetWidth() != w ||
mjpeg_decoder.GetHeight() != h)) {
if (ret &&
(mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
ARGBBuffers bufs = { argb, argb_stride, dw, dh };
ARGBBuffers bufs = {argb, argb_stride, dw, dh};
// YUV420
if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@ -290,7 +269,7 @@ int MJPGToARGB(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
// YUV422
// YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@ -301,7 +280,7 @@ int MJPGToARGB(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
// YUV444
// YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@ -312,7 +291,7 @@ int MJPGToARGB(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
// YUV400
// YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
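// A hedged usage sketch for the MJPG helpers above (requires a build with
// HAVE_JPEG): probe the frame's dimensions, size the destination, then
// decode straight to ARGB. MJPGSize returns 0 when the headers parse.
#include <vector>
#include "libyuv/convert.h"       // MJPGSize, MJPGToI420
#include "libyuv/convert_argb.h"  // MJPGToARGB
int MjpgFrameToArgb(const uint8* sample, size_t sample_size,
                    std::vector<uint8>* argb, int* width, int* height) {
  if (libyuv::MJPGSize(sample, sample_size, width, height) != 0) {
    return -1;  // could not parse JPEG headers
  }
  argb->resize(*width * 4 * *height);
  return libyuv::MJPGToARGB(sample, sample_size, argb->data(), *width * 4,
                            *width, *height,   // source dimensions
                            *width, *height);  // destination dimensions
}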

View File

@ -29,11 +29,16 @@ extern "C" {
// sample_size is measured in bytes and is the size of the frame.
// With MJPEG it is the compressed size of the frame.
LIBYUV_API
int ConvertToARGB(const uint8* sample, size_t sample_size,
uint8* crop_argb, int argb_stride,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
int ConvertToARGB(const uint8* sample,
size_t sample_size,
uint8* crop_argb,
int argb_stride,
int crop_x,
int crop_y,
int src_width,
int src_height,
int crop_width,
int crop_height,
enum RotationMode rotation,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
@ -49,16 +54,15 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
// and then rotate the I420 to the final destination buffer.
// For in-place conversion, if destination crop_argb is same as source sample,
// also enable temporary buffer.
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
crop_argb == sample;
LIBYUV_BOOL need_buf =
(rotation && format != FOURCC_ARGB) || crop_argb == sample;
uint8* dest_argb = crop_argb;
int dest_argb_stride = argb_stride;
uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
if (crop_argb == NULL || sample == NULL ||
src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
if (crop_argb == NULL || sample == NULL || src_width <= 0 ||
crop_width <= 0 || src_height == 0 || crop_height == 0) {
return -1;
}
if (src_height < 0) {
@ -67,7 +71,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) {
int argb_size = crop_width * 4 * abs_crop_height;
rotate_buffer = (uint8*)malloc(argb_size); /* NOLINT */
rotate_buffer = (uint8*)malloc(argb_size); /* NOLINT */
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
@ -79,100 +83,83 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = YUY2ToARGB(src, aligned_src_width * 2,
crop_argb, argb_stride,
r = YUY2ToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = UYVYToARGB(src, aligned_src_width * 2,
crop_argb, argb_stride,
r = UYVYToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RGB24ToARGB(src, src_width * 3,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = RGB24ToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RAWToARGB(src, src_width * 3,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = RAWToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_ARGB:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ARGBToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = ARGBToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = BGRAToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = BGRAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ABGRToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = ABGRToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = RGBAToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = RGBAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
r = RGB565ToARGB(src, src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = RGB565ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB1555ToARGB(src, src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = ARGB1555ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB4444ToARGB(src, src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = ARGB4444ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = I400ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
r = NV12ToARGB(src, src_width,
src_uv, aligned_src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
argb_stride, crop_width, inv_crop_height);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
// Call NV12 but with u and v parameters swapped.
r = NV21ToARGB(src, src_width,
src_uv, aligned_src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
argb_stride, crop_width, inv_crop_height);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
r = M420ToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = M420ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
// Triplanar formats
case FOURCC_I420:
@ -184,20 +171,17 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
src_v = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
}
r = I420ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
@ -208,14 +192,11 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
r = J420ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
@ -226,21 +207,18 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
crop_x / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
crop_x / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
}
r = I422ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
case FOURCC_I444:
@ -255,18 +233,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
r = I444ToARGB(src_y, src_width,
src_u, src_width,
src_v, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToARGB(sample, sample_size,
crop_argb, argb_stride,
src_width, abs_src_height, crop_width, inv_crop_height);
r = MJPGToARGB(sample, sample_size, crop_argb, argb_stride, src_width,
abs_src_height, crop_width, inv_crop_height);
break;
#endif
default:
@ -275,8 +249,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) {
if (!r) {
r = ARGBRotate(crop_argb, argb_stride,
dest_argb, dest_argb_stride,
r = ARGBRotate(crop_argb, argb_stride, dest_argb, dest_argb_stride,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);
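// A hedged call sketch for ConvertToARGB above: convert a packed YUY2
// capture frame to ARGB with no crop and no rotation. FOURCC_YUY2 and
// kRotate0 are existing libyuv identifiers; buffer sizing is the caller's
// responsibility.
#include <vector>
#include "libyuv/convert_argb.h"
#include "libyuv/video_common.h"
int Yuy2FrameToArgb(const uint8* frame, size_t frame_size,
                    int width, int height, std::vector<uint8>* argb) {
  argb->resize(width * 4 * height);
  return libyuv::ConvertToARGB(frame, frame_size, argb->data(), width * 4,
                               0, 0,           // crop_x, crop_y
                               width, height,  // source dimensions
                               width, height,  // crop: keep the full frame
                               libyuv::kRotate0, libyuv::FOURCC_YUY2);
}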

View File

@ -27,12 +27,18 @@ extern "C" {
LIBYUV_API
int ConvertToI420(const uint8* sample,
size_t sample_size,
uint8* y, int y_stride,
uint8* u, int u_stride,
uint8* v, int v_stride,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
uint8* y,
int y_stride,
uint8* u,
int u_stride,
uint8* v,
int v_stride,
int crop_x,
int crop_y,
int src_width,
int src_height,
int crop_width,
int crop_height,
enum RotationMode rotation,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
@ -43,9 +49,10 @@ int ConvertToI420(const uint8* sample,
// TODO(nisse): Why allow crop_height < 0?
const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
format != FOURCC_NV12 && format != FOURCC_NV21 &&
format != FOURCC_YV12) || y == sample;
LIBYUV_BOOL need_buf =
(rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
format != FOURCC_NV21 && format != FOURCC_YV12) ||
y == sample;
uint8* tmp_y = y;
uint8* tmp_u = u;
uint8* tmp_v = v;
@ -56,8 +63,7 @@ int ConvertToI420(const uint8* sample,
const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;
if (!y || !u || !v || !sample ||
src_width <= 0 || crop_width <= 0 ||
if (!y || !u || !v || !sample || src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
return -1;
}
@ -70,7 +76,7 @@ int ConvertToI420(const uint8* sample,
if (need_buf) {
int y_size = crop_width * abs_crop_height;
int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); /* NOLINT */
rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); /* NOLINT */
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
@ -85,130 +91,85 @@ int ConvertToI420(const uint8* sample,
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = YUY2ToI420(src, aligned_src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = YUY2ToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = UYVYToI420(src, aligned_src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = UYVYToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
r = RGB565ToI420(src, src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = RGB565ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB1555ToI420(src, src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = ARGB1555ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB4444ToI420(src, src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = ARGB4444ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RGB24ToI420(src, src_width * 3,
y, y_stride,
u, u_stride,
v, v_stride,
r = RGB24ToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RAWToI420(src, src_width * 3,
y, y_stride,
u, u_stride,
v, v_stride,
r = RAWToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ARGB:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ARGBToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
r = ARGBToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = BGRAToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
r = BGRAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ABGRToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
r = ABGRToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = RGBAToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
r = RGBAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
r = I400ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height, rotation);
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
y_stride, u, u_stride, v, v_stride, crop_width,
inv_crop_height, rotation);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
// Call NV12 but with u and v parameters swapped.
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
v, v_stride,
u, u_stride,
crop_width, inv_crop_height, rotation);
r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
y_stride, v, v_stride, u, u_stride, crop_width,
inv_crop_height, rotation);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
r = M420ToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
r = M420ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
// Triplanar formats
@ -221,22 +182,18 @@ int ConvertToI420(const uint8* sample,
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
src_v = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
}
r = I420Rotate(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height, rotation);
r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
y_stride, u, u_stride, v, v_stride, crop_width,
inv_crop_height, rotation);
break;
}
case FOURCC_I422:
@ -246,23 +203,19 @@ int ConvertToI420(const uint8* sample,
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
crop_x / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
crop_x / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
}
r = I422ToI420(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
y_stride, u, u_stride, v, v_stride, crop_width,
inv_crop_height);
break;
}
case FOURCC_I444:
@ -277,21 +230,14 @@ int ConvertToI420(const uint8* sample,
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
r = I444ToI420(src_y, src_width,
src_u, src_width,
src_v, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, y,
y_stride, u, u_stride, v, v_stride, crop_width,
inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToI420(sample, sample_size,
y, y_stride,
u, u_stride,
v, v_stride,
r = MJPGToI420(sample, sample_size, y, y_stride, u, u_stride, v, v_stride,
src_width, abs_src_height, crop_width, inv_crop_height);
break;
#endif
@ -301,13 +247,9 @@ int ConvertToI420(const uint8* sample,
if (need_buf) {
if (!r) {
r = I420Rotate(y, y_stride,
u, u_stride,
v, v_stride,
tmp_y, tmp_y_stride,
tmp_u, tmp_u_stride,
tmp_v, tmp_v_stride,
crop_width, abs_crop_height, rotation);
r = I420Rotate(y, y_stride, u, u_stride, v, v_stride, tmp_y, tmp_y_stride,
tmp_u, tmp_u_stride, tmp_v, tmp_v_stride, crop_width,
abs_crop_height, rotation);
}
free(rotate_buffer);
}
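// A hedged usage sketch for ConvertToI420 above: convert an NV21 camera
// frame to I420 while rotating it 90 degrees. After rotation the image is
// height pixels wide and width pixels tall, so the destination strides
// follow the rotated dimensions. Error handling is elided.
#include <vector>
#include "libyuv/convert.h"
#include "libyuv/video_common.h"
int Nv21FrameToI420Rotated(const uint8* frame, size_t frame_size,
                           int width, int height) {
  int dw = height, dh = width;  // rotated dimensions
  int halfw = (dw + 1) / 2, halfh = (dh + 1) / 2;
  std::vector<uint8> y(dw * dh), u(halfw * halfh), v(halfw * halfh);
  return libyuv::ConvertToI420(frame, frame_size,
                               y.data(), dw, u.data(), halfw, v.data(), halfw,
                               0, 0,            // no crop
                               width, height,   // source dimensions
                               width, height,   // crop: keep the full frame
                               libyuv::kRotate90, libyuv::FOURCC_NV21);
}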

View File

@ -13,7 +13,7 @@
#if defined(_MSC_VER)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
@ -44,8 +44,8 @@ extern "C" {
#endif
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
@ -74,18 +74,18 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
// GCC version uses inline x86 assembly.
#else // defined(_MSC_VER)
uint32 info_ebx, info_edx;
asm volatile (
#if defined( __i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=D" (info_ebx),
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"
: "=b" (info_ebx),
"cpuid \n"
: "=b"(info_ebx),
#endif // defined( __i386__) && defined(__PIC__)
"+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
"+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
cpu_info[0] = info_eax;
cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx;
@ -111,8 +111,8 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int GetXCR0() {
@ -120,7 +120,7 @@ int GetXCR0() {
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return xcr0;
}
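// Context for the raw bytes above: 0x0f 0x01 0xd0 is the encoding of the
// XGETBV instruction, emitted directly for assemblers that predate the
// mnemonic. A minimal sketch (assumed helper name) of the check it enables,
// mirroring the (GetXCR0() & 6) == 6 test used below:
static bool OsSavesYmmSketch() {
  // XCR0 bit 1 covers XMM state and bit 2 covers YMM state; both must be
  // set for AVX registers to survive context switches.
  return (GetXCR0() & 6) == 6;
}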
@ -135,8 +135,7 @@ int GetXCR0() {
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS
int ArmCpuCaps(const char* cpuinfo_name) {
LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
@ -163,8 +162,8 @@ int ArmCpuCaps(const char* cpuinfo_name) {
return 0;
}
LIBYUV_API SAFEBUFFERS
int MipsCpuCaps(const char* cpuinfo_name, const char ase[]) {
LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
const char ase[]) {
char cpuinfo_line[512];
int len = (int)strlen(ase);
FILE* f = fopen(cpuinfo_name, "r");
@ -218,20 +217,18 @@ static LIBYUV_BOOL TestEnv(const char*) {
}
#endif
LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) {
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
uint32 cpu_info0[4] = { 0, 0, 0, 0 };
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
uint32 cpu_info0[4] = {0, 0, 0, 0};
uint32 cpu_info1[4] = {0, 0, 0, 0};
uint32 cpu_info7[4] = {0, 0, 0, 0};
CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
}
cpu_info = kCpuHasX86 |
((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
@ -240,8 +237,7 @@ int InitCpuFlags(void) {
// AVX requires OS saves YMM registers.
if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
cpu_info |= kCpuHasAVX |
((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
@ -326,7 +322,7 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info = 0;
}
cpu_info |= kCpuInitialized;
cpu_info |= kCpuInitialized;
cpu_info_ = cpu_info;
return cpu_info;
}
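// A hedged usage sketch for the flag machinery above: callers normally go
// through TestCpuFlag, which initializes the cached flags on first use, and
// the LIBYUV_DISABLE_ASM environment variable checked above clears them all.
#include "libyuv/cpu_id.h"
bool CanUseAvx2Path() {
  return libyuv::TestCpuFlag(libyuv::kCpuHasAVX2) != 0;
}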

View File

@ -21,7 +21,7 @@
#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
#pragma warning(disable:4324)
#pragma warning(disable : 4324)
#endif
#endif
@ -129,7 +129,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
if (scanlines_[i]) {
delete scanlines_[i];
}
scanlines_[i] = new uint8* [scanlines_size];
scanlines_[i] = new uint8*[scanlines_size];
scanlines_sizes_[i] = scanlines_size;
}
@ -195,13 +195,11 @@ int MJpegDecoder::GetVertSampFactor(int component) {
}
int MJpegDecoder::GetHorizSubSampFactor(int component) {
return decompress_struct_->max_h_samp_factor /
GetHorizSampFactor(component);
return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
}
int MJpegDecoder::GetVertSubSampFactor(int component) {
return decompress_struct_->max_v_samp_factor /
GetVertSampFactor(component);
return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
}
int MJpegDecoder::GetImageScanlinesPerImcuRow() {
@ -245,10 +243,10 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
}
// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
uint8** planes, int dst_width, int dst_height) {
if (dst_width != GetWidth() ||
dst_height > GetHeight()) {
LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8** planes,
int dst_width,
int dst_height) {
if (dst_width != GetWidth() || dst_height > GetHeight()) {
// ERROR: Bad dimensions
return LIBYUV_FALSE;
}
@ -289,14 +287,13 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
for (int i = 0; i < num_outbufs_; ++i) {
// TODO(fbarchard): Compute skip to avoid this
assert(skip % GetVertSubSampFactor(i) == 0);
int rows_to_skip =
DivideAndRoundDown(skip, GetVertSubSampFactor(i));
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
rows_to_skip;
int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
int scanlines_to_copy =
GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
int data_to_skip = rows_to_skip * GetComponentStride(i);
CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
planes[i], GetComponentWidth(i),
GetComponentWidth(i), scanlines_to_copy);
CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
GetComponentWidth(i), GetComponentWidth(i),
scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
lines_left -= (GetImageScanlinesPerImcuRow() - skip);
@ -305,16 +302,15 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
// Read full MCUs but cropped horizontally
for (; lines_left > GetImageScanlinesPerImcuRow();
lines_left -= GetImageScanlinesPerImcuRow()) {
lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
CopyPlane(databuf_[i], GetComponentStride(i),
planes[i], GetComponentWidth(i),
GetComponentWidth(i), scanlines_to_copy);
CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
@ -328,19 +324,19 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy =
DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
CopyPlane(databuf_[i], GetComponentStride(i),
planes[i], GetComponentWidth(i),
GetComponentWidth(i), scanlines_to_copy);
CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
return FinishDecode();
}
LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
int dst_width, int dst_height) {
if (dst_width != GetWidth() ||
dst_height > GetHeight()) {
LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
void* opaque,
int dst_width,
int dst_height) {
if (dst_width != GetWidth() || dst_height > GetHeight()) {
// ERROR: Bad dimensions
return LIBYUV_FALSE;
}
@ -395,7 +391,7 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
}
// Read full MCUs until we get to the crop point.
for (; lines_left >= GetImageScanlinesPerImcuRow();
lines_left -= GetImageScanlinesPerImcuRow()) {
lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
@ -440,17 +436,17 @@ void term_source(j_decompress_ptr cinfo) {
#ifdef HAVE_SETJMP
void ErrorHandler(j_common_ptr cinfo) {
// This is called when a jpeglib command experiences an error. Unfortunately
// jpeglib's error handling model is not very flexible, because it expects the
// error handler to not return--i.e., it wants the program to terminate. To
// recover from errors we use setjmp() as shown in their example. setjmp() is
// C's implementation for the "call with current continuation" functionality
// seen in some functional programming languages.
// A formatted message can be output, but is unsafe for release.
// This is called when a jpeglib command experiences an error. Unfortunately
// jpeglib's error handling model is not very flexible, because it expects the
// error handler to not return--i.e., it wants the program to terminate. To
// recover from errors we use setjmp() as shown in their example. setjmp() is
// C's implementation for the "call with current continuation" functionality
// seen in some functional programming languages.
// A formatted message can be output, but is unsafe for release.
#ifdef DEBUG
char buf[JMSG_LENGTH_MAX];
(*cinfo->err->format_message)(cinfo, buf);
// ERROR: Error in jpeglib: buf
// ERROR: Error in jpeglib: buf
#endif
SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
@ -472,9 +468,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
// it.
DestroyOutputBuffers();
scanlines_ = new uint8** [num_outbufs];
scanlines_ = new uint8**[num_outbufs];
scanlines_sizes_ = new int[num_outbufs];
databuf_ = new uint8* [num_outbufs];
databuf_ = new uint8*[num_outbufs];
databuf_strides_ = new int[num_outbufs];
for (int i = 0; i < num_outbufs; ++i) {
@ -490,13 +486,13 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
void MJpegDecoder::DestroyOutputBuffers() {
for (int i = 0; i < num_outbufs_; ++i) {
delete [] scanlines_[i];
delete [] databuf_[i];
delete[] scanlines_[i];
delete[] databuf_[i];
}
delete [] scanlines_;
delete [] databuf_;
delete [] scanlines_sizes_;
delete [] databuf_strides_;
delete[] scanlines_;
delete[] databuf_;
delete[] scanlines_sizes_;
delete[] databuf_strides_;
scanlines_ = NULL;
databuf_ = NULL;
scanlines_sizes_ = NULL;
@ -542,26 +538,26 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) {
inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
jpeg_read_raw_data(decompress_struct_,
scanlines_,
GetImageScanlinesPerImcuRow());
jpeg_read_raw_data(decompress_struct_, scanlines_,
GetImageScanlinesPerImcuRow());
}
// The helper function which recognizes the jpeg sub-sampling type.
JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
int* subsample_x, int* subsample_y, int number_of_components) {
int* subsample_x,
int* subsample_y,
int number_of_components) {
if (number_of_components == 3) { // Color images.
if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
subsample_x[1] == 2 && subsample_y[1] == 2 &&
subsample_x[2] == 2 && subsample_y[2] == 2) {
if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) {
return kJpegYuv420;
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
subsample_x[1] == 2 && subsample_y[1] == 1 &&
subsample_x[2] == 2 && subsample_y[2] == 1) {
subsample_x[1] == 2 && subsample_y[1] == 1 &&
subsample_x[2] == 2 && subsample_y[2] == 1) {
return kJpegYuv422;
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
subsample_x[1] == 1 && subsample_y[1] == 1 &&
subsample_x[2] == 1 && subsample_y[2] == 1) {
subsample_x[1] == 1 && subsample_y[1] == 1 &&
subsample_x[2] == 1 && subsample_y[2] == 1) {
return kJpegYuv444;
}
} else if (number_of_components == 1) { // Grey-scale images.
@ -574,4 +570,3 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
} // namespace libyuv
#endif // HAVE_JPEG
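// An illustrative sketch (hypothetical helper, condensing the comparison
// chains in JpegSubsamplingTypeHelper above): component 0 is luma, and the
// sampling factors of the chroma components relative to luma pick the
// layout; a factor of 2 means half resolution on that axis.
enum SubsamplingSketch {
  kSketch420, kSketch422, kSketch444, kSketch400, kSketchUnknown
};
static SubsamplingSketch ClassifySketch(int chroma_x, int chroma_y,
                                        int num_components) {
  if (num_components == 1) return kSketch400;             // grayscale
  if (chroma_x == 2 && chroma_y == 2) return kSketch420;  // quarter-res
  if (chroma_x == 2 && chroma_y == 1) return kSketch422;  // half-res horiz
  if (chroma_x == 1 && chroma_y == 1) return kSketch444;  // full-res chroma
  return kSketchUnknown;
}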

View File

@ -24,7 +24,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
const uint8* it = sample;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
if (it == NULL) {
break;
}
@ -68,4 +68,3 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
} // extern "C"
} // namespace libyuv
#endif
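The memchr() loop in the hunk above scans for the JPEG End Of Image marker. Since the diff shows only a fragment, here is a self-contained sketch of the idea, assuming the standard two-byte marker 0xff 0xd9; the function name and bounds handling are illustrative:

#include <string.h>  // memchr

// Sketch: return true if the 0xff 0xd9 EOI marker occurs in the buffer.
static bool HasJpegEOI(const unsigned char* sample, size_t sample_size) {
  if (sample_size < 2) {
    return false;
  }
  const unsigned char* end = sample + sample_size - 1;  // Room for it[1].
  const unsigned char* it = sample;
  while (it < end) {
    it = static_cast<const unsigned char*>(memchr(it, 0xff, end - it));
    if (it == NULL) {
      break;
    }
    if (it[1] == 0xd9) {
      return true;  // Found EOI.
    }
    ++it;  // This 0xff began some other marker or was fill; keep scanning.
  }
  return false;
}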

File diff suppressed because it is too large
@ -22,12 +22,15 @@ extern "C" {
#endif
LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void TransposePlane(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
int i = height;
void (*TransposeWx8)(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) = TransposeWx8_C;
void (*TransposeWx8)(const uint8* src, int src_stride, uint8* dst,
int dst_stride, int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
@ -51,8 +54,8 @@ void TransposePlane(const uint8* src, int src_stride,
#endif
#if defined(HAS_TRANSPOSEWX8_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) &&
IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_Fast_DSPR2;
} else {
TransposeWx8 = TransposeWx8_DSPR2;
@ -63,8 +66,8 @@ void TransposePlane(const uint8* src, int src_stride,
// Work across the source in 8x8 tiles
while (i >= 8) {
TransposeWx8(src, src_stride, dst, dst_stride, width);
src += 8 * src_stride; // Go down 8 rows.
dst += 8; // Move over 8 columns.
src += 8 * src_stride; // Go down 8 rows.
dst += 8; // Move over 8 columns.
i -= 8;
}
@ -74,9 +77,12 @@ void TransposePlane(const uint8* src, int src_stride,
}
LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void RotatePlane90(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Rotate by 90 is a transpose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
@ -86,9 +92,12 @@ void RotatePlane90(const uint8* src, int src_stride,
}
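The hunk elides the two statements that the comment in RotatePlane90 describes. A reconstruction of their shape, for clarity (a sketch, not diff text):

// Point src at the last row and negate the stride, so the transpose
// reads the image bottom-to-top.
src += src_stride * (height - 1);  // First byte of the last row.
src_stride = -src_stride;          // Stepping "down" now moves up.
TransposePlane(src, src_stride, dst, dst_stride, width, height);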
LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void RotatePlane270(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Rotate by 270 is a transpose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@ -98,9 +107,12 @@ void RotatePlane270(const uint8* src, int src_stride,
}
LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void RotatePlane180(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width);
const uint8* src_bot = src + src_stride * (height - 1);
@ -135,9 +147,9 @@ void RotatePlane180(const uint8* src, int src_stride,
#endif
// TODO(fbarchard): Make MirrorRow on MIPS handle unaligned memory.
#if defined(HAS_MIRRORROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst, 4) &&
IS_ALIGNED(dst_stride, 4)) {
MirrorRow = MirrorRow_DSPR2;
}
#endif
@ -147,7 +159,7 @@ void RotatePlane180(const uint8* src, int src_stride,
if (IS_ALIGNED(width, 64)) {
MirrorRow = MirrorRow_MSA;
}
}
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@ -189,14 +201,17 @@ void RotatePlane180(const uint8* src, int src_stride,
}
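The row-swap loop of RotatePlane180 is elided by the hunk above. A reconstruction of its shape, mirroring the ARGB variant later in this diff (src_bot comes from the surrounding code; dst_bot is the analogous pointer to the last destination row, assumed here):

// Sketch: swap mirrored rows from the outside in, using the temp row.
for (y = 0; y < half_height; ++y) {
  MirrorRow(src, row, width);      // Mirror first row into a buffer.
  MirrorRow(src_bot, dst, width);  // Mirror last row into first row.
  CopyRow(row, dst_bot, width);    // Copy mirrored first row into last.
  src += src_stride;
  dst += dst_stride;
  src_bot -= src_stride;
  dst_bot -= dst_stride;
}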
LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void TransposeUV(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
int i = height;
void (*TransposeUVWx8)(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
void (*TransposeUVWx8)(const uint8* src, int src_stride, uint8* dst_a,
int dst_stride_a, uint8* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
@ -212,68 +227,72 @@ void TransposeUV(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) &&
IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_DSPR2;
}
#endif
// Work through the source in 8x8 tiles.
while (i >= 8) {
TransposeUVWx8(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width);
src += 8 * src_stride; // Go down 8 rows.
dst_a += 8; // Move over 8 columns.
dst_b += 8; // Move over 8 columns.
src += 8 * src_stride; // Go down 8 rows.
dst_a += 8; // Move over 8 columns.
dst_b += 8; // Move over 8 columns.
i -= 8;
}
if (i > 0) {
TransposeUVWxH_C(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width, i);
}
}
LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void RotateUV90(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
src += src_stride * (height - 1);
src_stride = -src_stride;
TransposeUV(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width, height);
TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
height);
}
LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void RotateUV270(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
dst_a += dst_stride_a * (width - 1);
dst_b += dst_stride_b * (width - 1);
dst_stride_a = -dst_stride_a;
dst_stride_b = -dst_stride_b;
TransposeUV(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width, height);
TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
height);
}
// Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void RotateUV180(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
int i;
void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
MirrorUVRow_C;
@ -288,8 +307,8 @@ void RotateUV180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_MIRRORUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
IS_ALIGNED(src_stride, 4)) {
MirrorUVRow = MirrorUVRow_DSPR2;
}
#endif
@ -306,9 +325,12 @@ void RotateUV180(const uint8* src, int src_stride,
}
LIBYUV_API
int RotatePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height,
int RotatePlane(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height,
enum RotationMode mode) {
if (!src || width <= 0 || height == 0 || !dst) {
return -1;
@ -324,24 +346,16 @@ int RotatePlane(const uint8* src, int src_stride,
switch (mode) {
case kRotate0:
// copy frame
CopyPlane(src, src_stride,
dst, dst_stride,
width, height);
CopyPlane(src, src_stride, dst, dst_stride, width, height);
return 0;
case kRotate90:
RotatePlane90(src, src_stride,
dst, dst_stride,
width, height);
RotatePlane90(src, src_stride, dst, dst_stride, width, height);
return 0;
case kRotate270:
RotatePlane270(src, src_stride,
dst, dst_stride,
width, height);
RotatePlane270(src, src_stride, dst, dst_stride, width, height);
return 0;
case kRotate180:
RotatePlane180(src, src_stride,
dst, dst_stride,
width, height);
RotatePlane180(src, src_stride, dst, dst_stride, width, height);
return 0;
default:
break;
@ -350,18 +364,25 @@ int RotatePlane(const uint8* src, int src_stride,
}
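One detail worth spelling out for callers of RotatePlane: kRotate90 and kRotate270 swap the output dimensions, so the destination stride must be sized for the source height. A hypothetical call (dimensions illustrative):

// Rotate a 640x480 8-bit plane a quarter turn; the result is 480x640.
static uint8 src_plane[640 * 480];  // Source, stride 640.
static uint8 dst_plane[480 * 640];  // Destination, stride 480.
int ret = RotatePlane(src_plane, 640, dst_plane, 480, 640, 480, kRotate90);
// ret is 0 on success, -1 if a pointer is NULL or width <= 0.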
LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
int I420Rotate(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height,
enum RotationMode mode) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
!dst_y || !dst_u || !dst_v) {
if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
!dst_u || !dst_v) {
return -1;
}
@ -380,45 +401,29 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
switch (mode) {
case kRotate0:
// copy frame
return I420Copy(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height);
return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
dst_v, dst_stride_v, width, height);
case kRotate90:
RotatePlane90(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotatePlane90(src_u, src_stride_u,
dst_u, dst_stride_u,
halfwidth, halfheight);
RotatePlane90(src_v, src_stride_v,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
halfheight);
RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
halfheight);
return 0;
case kRotate270:
RotatePlane270(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotatePlane270(src_u, src_stride_u,
dst_u, dst_stride_u,
halfwidth, halfheight);
RotatePlane270(src_v, src_stride_v,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
halfheight);
RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
halfheight);
return 0;
case kRotate180:
RotatePlane180(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotatePlane180(src_u, src_stride_u,
dst_u, dst_stride_u,
halfwidth, halfheight);
RotatePlane180(src_v, src_stride_v,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
halfheight);
RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
halfheight);
return 0;
default:
break;
@ -427,17 +432,23 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
}
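Note on strides for I420Rotate: chroma planes are (width + 1) / 2 by (height + 1) / 2, so after a 90 or 270 degree rotation their stride derives from halfheight. A hypothetical call for a 640x480 frame, assuming tightly packed planes:

// Rotate 640x480 I420 by 90 degrees; output planes are 480x640 (luma)
// and 240x320 (chroma).
I420Rotate(src_y, 640, src_u, 320, src_v, 320,
           dst_y, 480, dst_u, 240, dst_v, 240,
           640, 480, kRotate90);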
LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
int NV12ToI420Rotate(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height,
enum RotationMode mode) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_uv || width <= 0 || height == 0 ||
!dst_y || !dst_u || !dst_v) {
if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
!dst_v) {
return -1;
}
@ -454,38 +465,23 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
switch (mode) {
case kRotate0:
// copy frame
return NV12ToI420(src_y, src_stride_y,
src_uv, src_stride_uv,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
width, height);
case kRotate90:
RotatePlane90(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotateUV90(src_uv, src_stride_uv,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
dst_stride_v, halfwidth, halfheight);
return 0;
case kRotate270:
RotatePlane270(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotateUV270(src_uv, src_stride_uv,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
dst_stride_v, halfwidth, halfheight);
return 0;
case kRotate180:
RotatePlane180(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotateUV180(src_uv, src_stride_uv,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
dst_stride_v, halfwidth, halfheight);
return 0;
default:
break;

@ -18,16 +18,16 @@ namespace libyuv {
extern "C" {
#endif
#define TANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst, int dst_stride, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
}
#define TANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, uint8* dst, int dst_stride, \
int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
}
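To make the macro concrete, TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) expands to the following (reformatted here for readability):

// The SIMD kernel handles the largest multiple of 8 columns; the C
// fallback finishes the 0-7 leftover columns.
void TransposeWx8_Any_NEON(const uint8* src, int src_stride, uint8* dst,
                           int dst_stride, int width) {
  int r = width & 7;   // Leftover columns.
  int n = width - r;   // Largest multiple of 8 not exceeding width.
  if (n > 0) {
    TransposeWx8_NEON(src, src_stride, dst, dst_stride, n);
  }
  // Transposed columns land as rows, hence the n * dst_stride offset.
  TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
}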
#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
@ -44,19 +44,16 @@ TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
#undef TANY
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst_a, int dst_stride_a, \
uint8* dst_b, int dst_stride_b, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \
n); \
} \
TransposeUVWx8_C(src + n * 2, src_stride, \
dst_a + n * dst_stride_a, dst_stride_a, \
dst_b + n * dst_stride_b, dst_stride_b, r); \
}
void NAMEANY(const uint8* src, int src_stride, uint8* dst_a, \
int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
} \
TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \
dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
}
#ifdef HAS_TRANSPOSEUVWX8_NEON
TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
@ -73,8 +70,3 @@ TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
} // extern "C"
} // namespace libyuv
#endif

@ -22,29 +22,44 @@ extern "C" {
// ARGBScale has a function to copy pixels to a row, striding each source
// pixel by a constant.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) && !defined(__native_client__)) || \
defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr,
int src_stride,
int src_stepx,
uint8* dst_ptr,
int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr,
int src_stride,
int src_stepx,
uint8* dst_ptr,
int dst_width);
#endif
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
int src_stepx, uint8* dst_ptr, int dst_width);
void ScaleARGBRowDownEven_C(const uint8* src_ptr,
int,
int src_stepx,
uint8* dst_ptr,
int dst_width);
static void ARGBTranspose(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
static void ARGBTranspose(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
int i;
int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
int src_step, uint8* dst_ptr, int dst_width) =
ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
@ -63,8 +78,12 @@ static void ARGBTranspose(const uint8* src, int src_stride,
}
}
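The loop body elided by the hunk walks the source one column at a time; src_pixel_step is src_stride >> 2 because an ARGB pixel is four bytes, so advancing one pixel per source row means striding src_stride / 4 pixels. A reconstruction of the loop's shape (a sketch, not diff text):

// Each destination row is one source column.
for (i = 0; i < width; ++i) {
  // Gather 'height' pixels, one from each source row of column i.
  ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
  dst += dst_stride;  // Next destination row.
  src += 4;           // Next source column: 4 bytes per ARGB pixel.
}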
void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
void ARGBRotate90(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
@ -73,8 +92,12 @@ void ARGBRotate90(const uint8* src, int src_stride,
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
}
void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
void ARGBRotate270(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@ -83,8 +106,12 @@ void ARGBRotate270(const uint8* src, int src_stride,
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
}
void ARGBRotate180(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
void ARGBRotate180(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1);
@ -154,9 +181,9 @@ void ARGBRotate180(const uint8* src, int src_stride,
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
src += src_stride;
dst += dst_stride;
src_bot -= src_stride;
@ -166,8 +193,12 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, int width, int height,
int ARGBRotate(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height,
enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1;
@ -183,23 +214,19 @@ int ARGBRotate(const uint8* src_argb, int src_stride_argb,
switch (mode) {
case kRotate0:
// copy frame
return ARGBCopy(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
ARGBRotate90(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
case kRotate270:
ARGBRotate270(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
case kRotate180:
ARGBRotate180(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
default:
break;

@ -16,8 +16,11 @@ namespace libyuv {
extern "C" {
#endif
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
void TransposeWx8_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
@ -33,9 +36,13 @@ void TransposeWx8_C(const uint8* src, int src_stride,
}
}
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
void TransposeUVWx8_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
@ -60,9 +67,12 @@ void TransposeUVWx8_C(const uint8* src, int src_stride,
}
}
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void TransposeWxH_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
@ -72,10 +82,14 @@ void TransposeWxH_C(const uint8* src, int src_stride,
}
}
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void TransposeUVWxH_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;

@ -22,342 +22,348 @@ extern "C" {
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
void TransposeWx8_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
void TransposeWx8_Fast_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
"xmm15");
}
#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
void TransposeUVWx8_SSE2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7", "xmm8", "xmm9");
}
#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
#endif // defined(__x86_64__) || defined(__i386__)
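Every SIMD kernel in this file must produce byte-identical output to the C reference, which makes a small cross-check handy when touching the assembly. A test sketch (not part of libyuv; assumes the kernel under test was compiled in):

#include <assert.h>
#include <string.h>

// Sketch: compare an 8x8 transpose kernel against TransposeWx8_C.
static void CheckTransposeWx8(void (*fn)(const uint8*, int,
                                         uint8*, int, int)) {
  uint8 src[64], got[64], want[64];
  for (int i = 0; i < 64; ++i) {
    src[i] = (uint8)i;  // Distinct bytes so any misplacement is caught.
  }
  TransposeWx8_C(src, 8, want, 8, 8);  // Reference result.
  fn(src, 8, got, 8, 8);               // Kernel under test.
  assert(memcmp(got, want, 64) == 0);
}
// Usage: CheckTransposeWx8(TransposeWx8_SSSE3);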

@ -18,18 +18,20 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
void TransposeWx8_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
@ -38,8 +40,8 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
// dst + dst_stride word aligned
"1: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
@ -65,8 +67,8 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
"bnez %[width], 1b \n"
" addu %[dst], %[dst], %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
// dst + dst_stride unaligned
"11: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
@ -92,23 +94,20 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
"swr $s1, 4(%[dst]) \n"
"swl $s1, 7(%[dst]) \n"
"bnez %[width], 11b \n"
"addu %[dst], %[dst], %[dst_stride] \n"
"2: \n"
"addu %[dst], %[dst], %[dst_stride] \n"
"2: \n"
".set pop \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1"
);
: [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
: [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
}
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
void TransposeWx8_Fast_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
__asm__ __volatile__(
".set noat \n"
".set push \n"
".set noreorder \n"
@ -126,67 +125,67 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
// dst + dst_stride word aligned
"1: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
@ -207,67 +206,67 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
"bnez $AT, 1b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
// dst + dst_stride unaligned
"11: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
@ -298,34 +297,33 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
"2: \n"
".set pop \n"
".set at \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
);
: [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
: [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
"s2", "s3", "s4", "s5", "s6", "s7");
}
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
void TransposeUVWx8_DSPR2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
__asm__ __volatile__ (
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"subu $t7, $t9, %[src_stride] \n"
"srl $t1, %[width], 1 \n"
      // check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
"andi $t0, %[dst_a], 0x3 \n"
"andi $t8, %[dst_b], 0x3 \n"
"or $t0, $t0, $t8 \n"
@ -335,52 +333,52 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
"or $t0, $t0, $t8 \n"
"bnez $t0, 11f \n"
" nop \n"
// dst + dst_stride word aligned (both, a & b dst addresses)
"1: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
// dst + dst_stride word aligned (both, a & b dst addresses)
"1: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"sw $s3, 0($s5) \n"
"sw $s4, 0($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"sw $s3, 0(%[dst_a]) \n"
"sw $s4, 0(%[dst_b]) \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"sw $s3, 4($s5) \n"
"sw $s4, 4($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
@ -394,59 +392,59 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
"b 2f \n"
" nop \n"
// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
"11: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
"11: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"swr $s3, 0($s5) \n"
"swl $s3, 3($s5) \n"
"swr $s4, 0($s6) \n"
"swl $s4, 3($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"swr $s3, 0(%[dst_a]) \n"
"swl $s3, 3(%[dst_a]) \n"
"swr $s4, 0(%[dst_b]) \n"
"swl $s4, 3(%[dst_b]) \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"swr $s3, 4($s5) \n"
"swl $s3, 7($s5) \n"
"swr $s4, 4($s6) \n"
"swl $s4, 7($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
@ -462,18 +460,11 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
"2: \n"
".set pop \n"
: [src] "+r" (src),
[dst_a] "+r" (dst_a),
[dst_b] "+r" (dst_b),
[width] "+r" (width),
[src_stride] "+r" (src_stride)
: [dst_stride_a] "r" (dst_stride_a),
[dst_stride_b] "r" (dst_stride_b)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
: [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
[width] "+r"(width), [src_stride] "+r"(src_stride)
: [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
"s2", "s3", "s4", "s5", "s6");
}
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

@ -21,11 +21,13 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
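The table above is the permutation a NEON vtbl lookup applies to a 16-byte register holding a row-major 4x4 tile: output byte i takes input byte kVTbl4x4Transpose[i], which moves element (r, c) to (c, r). In scalar form, treating the table as a plain array of 16 bytes:

// Scalar model of the vtbl lookup: a 4x4 byte transpose.
// in[4 * r + c] ends up at out[4 * c + r].
for (int i = 0; i < 16; ++i) {
  out[i] = in[kVTbl4x4Transpose[i]];
}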
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
void TransposeWx8_NEON(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
const uint8* src_temp;
asm volatile (
@ -240,12 +242,15 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}
static uvec8 kVTbl4x4TransposeDi =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
4, 12, 5, 13, 6, 14, 7, 15};
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
void TransposeUVWx8_NEON(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
const uint8* src_temp;
asm volatile (
@ -21,13 +21,16 @@ extern "C" {
// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
void TransposeWx8_NEON(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
const uint8* src_temp;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
int64 width64 = (int64)width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
@ -247,16 +250,19 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}
static uint8 kVTbl4x4TransposeDi[32] =
{ 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
static uint8 kVTbl4x4TransposeDi[32] = {
0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
void TransposeUVWx8_NEON(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
const uint8* src_temp;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
int64 width64 = (int64)width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
@ -19,15 +19,17 @@ extern "C" {
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
@ -110,18 +112,20 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
}
}
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
@ -134,8 +138,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
// Read in the data from the source pointer.
// First round of bit swap.
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
@ -162,7 +166,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
@ -183,7 +187,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
@ -200,7 +204,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
@ -209,7 +213,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
@ -218,7 +222,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
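The "round of bit swap" comments above describe the standard log2 transpose: an 8x8 byte tile is transposed in three rounds of interleaves, each round doubling the element size (bytes, then 16-bit pairs, then 32-bit quads). A sketch of the same idea using SSE2 intrinsics (illustrative only; the shipped code is the hand-written asm above):

#include <emmintrin.h>  // SSE2
#include <stdint.h>

// Transpose an 8x8 tile of bytes with three rounds of interleaving.
static void Transpose8x8_SSE2(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride) {
  __m128i r[8], t[4], u[4], v[4];
  for (int i = 0; i < 8; ++i) {  // load 8 rows of 8 bytes each
    r[i] = _mm_loadl_epi64((const __m128i*)(src + i * src_stride));
  }
  // Round 1: interleave bytes of adjacent rows.
  t[0] = _mm_unpacklo_epi8(r[0], r[1]);
  t[1] = _mm_unpacklo_epi8(r[2], r[3]);
  t[2] = _mm_unpacklo_epi8(r[4], r[5]);
  t[3] = _mm_unpacklo_epi8(r[6], r[7]);
  // Round 2: interleave 16-bit pairs.
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  // Round 3: interleave 32-bit quads; each result holds two output rows.
  v[0] = _mm_unpacklo_epi32(u[0], u[2]);  // columns 0 and 1
  v[1] = _mm_unpackhi_epi32(u[0], u[2]);  // columns 2 and 3
  v[2] = _mm_unpacklo_epi32(u[1], u[3]);  // columns 4 and 5
  v[3] = _mm_unpackhi_epi32(u[1], u[3]);  // columns 6 and 7
  for (int i = 0; i < 4; ++i) {
    _mm_storel_epi64((__m128i*)(dst + (2 * i) * dst_stride), v[i]);
    _mm_storel_epi64((__m128i*)(dst + (2 * i + 1) * dst_stride),
                     _mm_srli_si128(v[i], 8));
  }
}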
@ -23,26 +23,26 @@ extern "C" {
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
// Any 4 planes to 1 with yuvconstants
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
const uint8* a_buf, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 5]); \
memset(temp, 0, 64 * 4); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 192, a_buf + n, r); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
SS(r, DUVSHIFT) * BPP); \
}
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
const uint8* a_buf, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 5]); \
memset(temp, 0, 64 * 4); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 192, a_buf + n, r); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
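All of these ANY* wrappers follow one pattern: the SIMD kernel handles the largest multiple of MASK + 1 pixels directly, and any leftover pixels are staged through a zeroed, SIMD-aligned temp buffer so the kernel can still run on one full block. SS(width, shift) is a rounding-up shift, i.e. ceil(width / 2^shift), so SS(7, 1) == 4. Hand-expanded for a hypothetical 1:1 kernel that processes 16 pixels per call (names are illustrative, not libyuv API):

void SomeRow_Any_SIMD(const uint8* src_ptr, uint8* dst_ptr, int width) {
  SIMD_ALIGNED(uint8 temp[128 * 2]);
  memset(temp, 0, 128);      /* zero the staging row for msan */
  int r = width & 15;        /* leftover pixels */
  int n = width & ~15;       /* largest multiple of 16 */
  if (n > 0) {
    SomeRow_SIMD(src_ptr, dst_ptr, n);   /* fast path on full blocks */
  }
  memcpy(temp, src_ptr + n, r);          /* stage the remainder */
  SomeRow_SIMD(temp, temp + 128, 16);    /* one padded block */
  memcpy(dst_ptr + n, temp + 128, r);    /* copy back only the valid bytes */
}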
@ -59,23 +59,23 @@ ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
#undef ANY41C
// Any 3 planes to 1.
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@ -104,28 +104,27 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
// on arm that subsamples 444 to 422 internally.
// Any 3 planes to 1 with yuvconstants
#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
if (width & 1) { \
temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
} \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
if (width & 1) { \
temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
} \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
@ -179,22 +178,22 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
#undef ANY31C
// Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
// Merge functions.
#ifdef HAS_MERGEUVROW_SSE2
@ -256,23 +255,22 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
#undef ANY21
// Any 2 planes to 1 with yuvconstants
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
// Biplanar to RGB.
#ifdef HAS_NV12TOARGBROW_SSSE3
@ -305,19 +303,19 @@ ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#undef ANY21C
// Any 1 to 1.
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#ifdef HAS_COPYROW_AVX
ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
@ -500,20 +498,20 @@ ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
#undef ANY11
// Any 1 to 1 blended. Destination is read, modify, write.
#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
@ -530,32 +528,43 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#undef ANY11B
// Any 1 to 1 with parameter.
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
T shuffler, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, T shuffler, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
const uint32, 4, 2, 3)
ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
ARGBToRGB565DitherRow_SSE2,
const uint32,
4,
2,
3)
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
const uint32, 4, 2, 7)
ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
ARGBToRGB565DitherRow_AVX2,
const uint32,
4,
2,
7)
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
const uint32, 4, 2, 7)
ANY11P(ARGBToRGB565DitherRow_Any_NEON,
ARGBToRGB565DitherRow_NEON,
const uint32,
4,
2,
7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSE2
ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
@ -572,20 +581,20 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
#undef ANY11P
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, \
T shuffler, int width) { \
SIMD_ALIGNED(uint16 temp[32 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T shuffler, \
int width) { \
SIMD_ALIGNED(uint16 temp[32 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
#ifdef HAS_HALFFLOATROW_SSE2
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 7)
@ -604,20 +613,20 @@ ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 1, 1, 7)
#undef ANY11P16
// Any 1 to 1 with yuvconstants
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#if defined(HAS_YUY2TOARGBROW_SSSE3)
ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
@ -634,21 +643,20 @@ ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
ptrdiff_t src_stride_ptr, int width, \
int source_y_fraction) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, \
int width, int source_y_fraction) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#ifdef HAS_INTERPOLATEROW_AVX2
ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
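InterpolateRow blends two source rows with an 8-bit fraction, with source_y_fraction weighting the second row out of 256. A scalar model of what the kernels compute (a sketch; the SIMD versions may differ in rounding details):

static void InterpolateRowScalar(uint8* dst_ptr, const uint8* src_ptr,
                                 ptrdiff_t src_stride_ptr, int width,
                                 int source_y_fraction) {
  int f1 = source_y_fraction;  /* weight of the second row, 0..256 */
  int f0 = 256 - f1;           /* weight of the first row */
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] =
        (uint8)((src_ptr[x] * f0 + src_ptr[x + src_stride_ptr] * f1) >> 8);
  }
}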
@ -665,19 +673,19 @@ ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
#undef ANY11T
// Any 1 to 1 mirror.
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
} \
memcpy(temp, src_ptr, r * BPP); \
ANY_SIMD(temp, temp + 64, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
}
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
} \
memcpy(temp, src_ptr, r* BPP); \
ANY_SIMD(temp, temp + 64, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
}
#ifdef HAS_MIRRORROW_AVX2
ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
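The mirror variant has a twist: because output order is reversed, the SIMD pass starts r pixels into the source so the mirrored full blocks land at dst[0..n), and the first r source pixels go through the temp buffer, where their mirrored images end up at the tail of the padded block. Expanded for the AVX2 1-byte case above (MASK = 31, BPP = 1; a sketch of the macro expansion):

void MirrorRow_Any_AVX2_expanded(const uint8* src, uint8* dst, int width) {
  SIMD_ALIGNED(uint8 temp[64 * 2]);
  memset(temp, 0, 64);
  int r = width & 31;
  int n = width & ~31;
  if (n > 0) {
    /* skipping the first r pixels puts the mirrored blocks at dst[0..n) */
    MirrorRow_AVX2(src + r, dst, n);
  }
  memcpy(temp, src, r);                 /* first r pixels of the source */
  MirrorRow_AVX2(temp, temp + 64, 32);  /* mirror one full block */
  /* their images sit at the end of the mirrored block */
  memcpy(dst + n, temp + 64 + (32 - r), r);
}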
@ -706,17 +714,17 @@ ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
#undef ANY11M
// Any 1 plane. (memset)
#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, T v32, int width) { \
SIMD_ALIGNED(uint8 temp[64]); \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, v32, n); \
} \
ANY_SIMD(temp, v32, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp, r * BPP); \
}
#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, T v32, int width) { \
SIMD_ALIGNED(uint8 temp[64]); \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, v32, n); \
} \
ANY_SIMD(temp, v32, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp, r * BPP); \
}
#ifdef HAS_SETROW_X86
ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
@ -730,20 +738,20 @@ ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
#undef ANY1
// Any 1 to 2. Outputs UV planes.
#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
SIMD_ALIGNED(uint8 temp[128 * 3]); \
memset(temp, 0, 128); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
}
#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 3]); \
memset(temp, 0, 128); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
}
#ifdef HAS_SPLITUVROW_SSE2
ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
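SplitUVRow deinterleaves a packed UV row (as in NV12) into separate U and V planes. In scalar terms, the kernels amount to:

static void SplitUVRowScalar(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x];      /* even bytes are U */
    dst_v[x] = src_uv[2 * x + 1];  /* odd bytes are V */
  }
}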
@ -781,29 +789,29 @@ ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \
uint8* dst_u, uint8* dst_v, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 4]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
} \
ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
}
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, int src_stride_ptr, uint8* dst_u, \
uint8* dst_v, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 4]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
BPP); \
memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
} \
ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
}
#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
File diff suppressed because it is too large
@ -16,6 +16,8 @@ namespace libyuv {
extern "C" {
#endif
// clang-format off
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
@ -2566,7 +2568,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
@ -2603,7 +2605,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@ -2637,7 +2639,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
asm volatile (
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
"lea " MEMLEA(0x20,0) ",%0 \n"
@ -2668,7 +2670,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
asm volatile (
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
@ -5146,7 +5148,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
asm volatile (
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"1: \n"
"movq " MEMACCESS(1) ",%%xmm2 \n"
MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
"lea " MEMLEA(0x8,1) ",%1 \n"
@ -5181,7 +5183,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
asm volatile (
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"1: \n"
"movq " MEMACCESS(1) ",%%xmm2 \n"
MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
"lea " MEMLEA(0x8,1) ",%1 \n"
@ -5602,6 +5604,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#endif // defined(__x86_64__) || defined(__i386__)
// clang-format on
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
File diff suppressed because it is too large
@ -20,108 +20,111 @@ extern "C" {
#endif
// Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) { \
ub = __msa_fill_w(yuvconst->kUVToB[0]); \
vr = __msa_fill_w(yuvconst->kUVToR[1]); \
ug = __msa_fill_w(yuvconst->kUVToG[0]); \
vg = __msa_fill_w(yuvconst->kUVToG[1]); \
bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
}
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
{ \
ub = __msa_fill_w(yuvconst->kUVToB[0]); \
vr = __msa_fill_w(yuvconst->kUVToR[1]); \
ug = __msa_fill_w(yuvconst->kUVToG[0]); \
vg = __msa_fill_w(yuvconst->kUVToG[1]); \
bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
}
// Load YUV 422 pixel data
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) { \
uint64 y_m; \
uint32 u_m, v_m; \
v4i32 zero_m = { 0 }; \
y_m = LD(psrc_y); \
u_m = LW(psrc_u); \
v_m = LW(psrc_v); \
out_y = (v16u8) __msa_insert_d((v2i64) zero_m, 0, (int64) y_m); \
out_u = (v16u8) __msa_insert_w(zero_m, 0, (int32) u_m); \
out_v = (v16u8) __msa_insert_w(zero_m, 0, (int32) v_m); \
}
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
{ \
uint64 y_m; \
uint32 u_m, v_m; \
v4i32 zero_m = {0}; \
y_m = LD(psrc_y); \
u_m = LW(psrc_u); \
v_m = LW(psrc_v); \
out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \
out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \
}
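READYUV422 loads 8 Y bytes but only 4 U and 4 V bytes per step, which is exactly 4:2:2 sampling: each chroma pair is shared by two horizontally adjacent luma samples (the ilvr_b duplication in YUVTORGB below performs the upsampling). In scalar terms:

// Illustration of the 4:2:2 layout READYUV422 consumes: 8 Y samples share
// 4 U and 4 V samples, one chroma pair per two adjacent pixels.
static void ReadYuv422Scalar(const uint8* src_y, const uint8* src_u,
                             const uint8* src_v, uint8 y[8], uint8 u[8],
                             uint8 v[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    y[i] = src_y[i];
    u[i] = src_u[i >> 1];  /* duplicate each chroma sample across 2 pixels */
    v[i] = src_v[i >> 1];
  }
}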
// Convert 8 pixels of YUV 420 to RGB.
#define YUVTORGB(in_y, in_u, in_v, \
ub, vr, ug, vg, bb, bg, br, yg, \
out_b, out_g, out_r) { \
v8i16 vec0_m; \
v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \
v4i32 max_val_m = __msa_ldi_w(255); \
v8i16 zero_m = { 0 }; \
\
in_u = (v16u8) __msa_ilvr_b((v16i8) in_u, (v16i8) in_u); \
in_v = (v16u8) __msa_ilvr_b((v16i8) in_v, (v16i8) in_v); \
vec0_m = (v8i16) __msa_ilvr_b((v16i8) in_y, (v16i8) in_y); \
reg0_m = (v4i32) __msa_ilvr_h(zero_m, vec0_m); \
reg1_m = (v4i32) __msa_ilvl_h(zero_m, vec0_m); \
reg0_m *= vec_yg; \
reg1_m *= vec_yg; \
reg0_m = __msa_srai_w(reg0_m, 16); \
reg1_m = __msa_srai_w(reg1_m, 16); \
reg4_m = reg0_m + br; \
reg5_m = reg1_m + br; \
reg2_m = reg0_m + bg; \
reg3_m = reg1_m + bg; \
reg0_m += bb; \
reg1_m += bb; \
vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in_u); \
reg6_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \
reg7_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \
vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in_v); \
reg8_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \
reg9_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \
reg0_m -= reg6_m * ub; \
reg1_m -= reg7_m * ub; \
reg2_m -= reg6_m * ug; \
reg3_m -= reg7_m * ug; \
reg4_m -= reg8_m * vr; \
reg5_m -= reg9_m * vr; \
reg2_m -= reg8_m * vg; \
reg3_m -= reg9_m * vg; \
reg0_m = __msa_srai_w(reg0_m, 6); \
reg1_m = __msa_srai_w(reg1_m, 6); \
reg2_m = __msa_srai_w(reg2_m, 6); \
reg3_m = __msa_srai_w(reg3_m, 6); \
reg4_m = __msa_srai_w(reg4_m, 6); \
reg5_m = __msa_srai_w(reg5_m, 6); \
reg0_m = __msa_maxi_s_w(reg0_m, 0); \
reg1_m = __msa_maxi_s_w(reg1_m, 0); \
reg2_m = __msa_maxi_s_w(reg2_m, 0); \
reg3_m = __msa_maxi_s_w(reg3_m, 0); \
reg4_m = __msa_maxi_s_w(reg4_m, 0); \
reg5_m = __msa_maxi_s_w(reg5_m, 0); \
reg0_m = __msa_min_s_w(reg0_m, max_val_m); \
reg1_m = __msa_min_s_w(reg1_m, max_val_m); \
reg2_m = __msa_min_s_w(reg2_m, max_val_m); \
reg3_m = __msa_min_s_w(reg3_m, max_val_m); \
reg4_m = __msa_min_s_w(reg4_m, max_val_m); \
reg5_m = __msa_min_s_w(reg5_m, max_val_m); \
out_b = __msa_pckev_h((v8i16) reg1_m, (v8i16) reg0_m); \
out_g = __msa_pckev_h((v8i16) reg3_m, (v8i16) reg2_m); \
out_r = __msa_pckev_h((v8i16) reg5_m, (v8i16) reg4_m); \
}
#define YUVTORGB(in_y, in_u, in_v, ub, vr, ug, vg, bb, bg, br, yg, out_b, \
out_g, out_r) \
{ \
v8i16 vec0_m; \
v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \
v4i32 max_val_m = __msa_ldi_w(255); \
v8i16 zero_m = {0}; \
\
in_u = (v16u8)__msa_ilvr_b((v16i8)in_u, (v16i8)in_u); \
in_v = (v16u8)__msa_ilvr_b((v16i8)in_v, (v16i8)in_v); \
vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
reg0_m = (v4i32)__msa_ilvr_h(zero_m, vec0_m); \
reg1_m = (v4i32)__msa_ilvl_h(zero_m, vec0_m); \
reg0_m *= vec_yg; \
reg1_m *= vec_yg; \
reg0_m = __msa_srai_w(reg0_m, 16); \
reg1_m = __msa_srai_w(reg1_m, 16); \
reg4_m = reg0_m + br; \
reg5_m = reg1_m + br; \
reg2_m = reg0_m + bg; \
reg3_m = reg1_m + bg; \
reg0_m += bb; \
reg1_m += bb; \
vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_u); \
reg6_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \
reg7_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \
vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_v); \
reg8_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \
reg9_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \
reg0_m -= reg6_m * ub; \
reg1_m -= reg7_m * ub; \
reg2_m -= reg6_m * ug; \
reg3_m -= reg7_m * ug; \
reg4_m -= reg8_m * vr; \
reg5_m -= reg9_m * vr; \
reg2_m -= reg8_m * vg; \
reg3_m -= reg9_m * vg; \
reg0_m = __msa_srai_w(reg0_m, 6); \
reg1_m = __msa_srai_w(reg1_m, 6); \
reg2_m = __msa_srai_w(reg2_m, 6); \
reg3_m = __msa_srai_w(reg3_m, 6); \
reg4_m = __msa_srai_w(reg4_m, 6); \
reg5_m = __msa_srai_w(reg5_m, 6); \
reg0_m = __msa_maxi_s_w(reg0_m, 0); \
reg1_m = __msa_maxi_s_w(reg1_m, 0); \
reg2_m = __msa_maxi_s_w(reg2_m, 0); \
reg3_m = __msa_maxi_s_w(reg3_m, 0); \
reg4_m = __msa_maxi_s_w(reg4_m, 0); \
reg5_m = __msa_maxi_s_w(reg5_m, 0); \
reg0_m = __msa_min_s_w(reg0_m, max_val_m); \
reg1_m = __msa_min_s_w(reg1_m, max_val_m); \
reg2_m = __msa_min_s_w(reg2_m, max_val_m); \
reg3_m = __msa_min_s_w(reg3_m, max_val_m); \
reg4_m = __msa_min_s_w(reg4_m, max_val_m); \
reg5_m = __msa_min_s_w(reg5_m, max_val_m); \
out_b = __msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
out_g = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
out_r = __msa_pckev_h((v8i16)reg5_m, (v8i16)reg4_m); \
}
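Stripped of the vector shuffles, YUVTORGB is fixed-point math: the Y byte is replicated into a 16-bit lane (y * 0x0101) and scaled by yg in Q16, then the per-channel bias and UV contributions are applied, shifted down 6 bits, and clamped to [0, 255] (the maxi/min pairs). A scalar model of the same arithmetic (a sketch following the vector code):

static uint8 Clamp0To255(int32 v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvPixelModel(uint8 y, uint8 u, uint8 v,
                          int32 ub, int32 vr, int32 ug, int32 vg,
                          int32 bb, int32 bg, int32 br, int32 yg,
                          uint8* b, uint8* g, uint8* r) {
  /* ilvr_b(y, y) replicates the Y byte into a 16-bit lane: y * 0x0101 */
  int32 y1 = (int32)(((uint32)(y * 0x0101) * (uint32)yg) >> 16);
  *b = Clamp0To255((y1 + bb - u * ub) >> 6);
  *g = Clamp0To255((y1 + bg - u * ug - v * vg) >> 6);
  *r = Clamp0To255((y1 + br - v * vr) >> 6);
}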
// Pack and Store 8 ARGB values.
#define STOREARGB(in0, in1, in2, in3, pdst_argb) { \
v8i16 vec0_m, vec1_m; \
v16u8 dst0_m, dst1_m; \
vec0_m = (v8i16) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \
vec1_m = (v8i16) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \
dst0_m = (v16u8) __msa_ilvr_h(vec1_m, vec0_m); \
dst1_m = (v16u8) __msa_ilvl_h(vec1_m, vec0_m); \
ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \
}
#define STOREARGB(in0, in1, in2, in3, pdst_argb) \
{ \
v8i16 vec0_m, vec1_m; \
v16u8 dst0_m, dst1_m; \
vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \
dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \
ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \
}
void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
v16i8 shuffler = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
src += width - 64;
for (x = 0; x < width; x += 64) {
@ -138,7 +141,7 @@ void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
v16i8 shuffler = { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 };
v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
src += width * 4 - 64;
for (x = 0; x < width; x += 16) {
@ -199,22 +202,25 @@ void I422ToUYVYRow_MSA(const uint8* src_y,
}
}
void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) {
void I422ToARGBRow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v16u8 const_255 = (v16u8) __msa_ldi_b(255);
v16u8 const_255 = (v16u8)__msa_ldi_b(255);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug,
vec_vg, vec_bb, vec_bg, vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, const_255, rgb_buf);
src_y += 8;
src_u += 4;
@ -223,22 +229,25 @@ void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u,
}
}
void YUVTORGBARow_MSA(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) {
void YUVTORGBARow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v16u8 const_255 = (v16u8) __msa_ldi_b(255);
v16u8 const_255 = (v16u8)__msa_ldi_b(255);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug,
vec_vg, vec_bb, vec_bg, vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg, vec0, vec1, vec2);
STOREARGB(const_255, vec0, vec1, vec2, rgb_buf);
src_y += 8;
src_u += 4;
@ -247,8 +256,10 @@ void YUVTORGBARow_MSA(const uint8* src_y, const uint8* src_u,
}
}
void I422AlphaToARGBRow_MSA(const uint8* src_y, const uint8* src_u,
const uint8* src_v, const uint8* src_a,
void I422AlphaToARGBRow_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
const uint8* src_a,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
@ -257,18 +268,18 @@ void I422AlphaToARGBRow_MSA(const uint8* src_y, const uint8* src_u,
v16u8 src0, src1, src2, src3;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 zero = { 0 };
v4i32 zero = {0};
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug,
vec_vg, vec_bb, vec_bg, vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
for (x = 0; x < width; x += 8) {
data_a = LD(src_a);
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src3 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_a);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
src3 = (v16u8) __msa_ilvr_b((v16i8) src3, (v16i8) src3);
src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg, vec0, vec1, vec2);
src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
src_y += 8;
src_u += 4;
@ -278,44 +289,47 @@ void I422AlphaToARGBRow_MSA(const uint8* src_y, const uint8* src_u,
}
}
void YUVTORGB24Row_MSA(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int32 width) {
void YUVTORGB24Row_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int32 width) {
int x;
int64 data_u, data_v;
v16u8 src0, src1, src2, src3, src4, src5, dst0, dst1, dst2;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v16u8 reg0, reg1, reg2, reg3;
v2i64 zero = { 0 };
v16i8 shuffler0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10 };
v16i8 shuffler1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10 };
v16i8 shuffler2 =
{ 26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31 };
v2i64 zero = {0};
v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
11, 29, 12, 13, 30, 14, 15, 31};
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug,
vec_vg, vec_bb, vec_bg, vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
for (x = 0; x < width; x += 16) {
src0 = (v16u8) __msa_ld_b((v16u8*) src_y, 0);
src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
data_u = LD(src_u);
data_v = LD(src_v);
src1 = (v16u8) __msa_insert_d(zero, 0, data_u);
src2 = (v16u8) __msa_insert_d(zero, 0, data_v);
src3 = (v16u8) __msa_sldi_b((v16i8) src0, (v16i8) src0, 8);
src4 = (v16u8) __msa_sldi_b((v16i8) src1, (v16i8) src1, 4);
src5 = (v16u8) __msa_sldi_b((v16i8) src2, (v16i8) src2, 4);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
YUVTORGB(src3, src4, src5, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec3, vec4, vec5);
reg0 = (v16u8) __msa_ilvev_b((v16i8) vec1, (v16i8) vec0);
reg2 = (v16u8) __msa_ilvev_b((v16i8) vec4, (v16i8) vec3);
reg3 = (v16u8) __msa_pckev_b((v16i8) vec5, (v16i8) vec2);
reg1 = (v16u8) __msa_sldi_b((v16i8) reg2, (v16i8) reg0, 11);
dst0 = (v16u8) __msa_vshf_b(shuffler0, (v16i8) reg3, (v16i8) reg0);
dst1 = (v16u8) __msa_vshf_b(shuffler1, (v16i8) reg3, (v16i8) reg1);
dst2 = (v16u8) __msa_vshf_b(shuffler2, (v16i8) reg3, (v16i8) reg2);
src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 4);
src5 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg, vec0, vec1, vec2);
YUVTORGB(src3, src4, src5, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg, vec3, vec4, vec5);
reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
ST_UB2(dst0, dst1, rgb_buf, 16);
ST_UB(dst2, (rgb_buf + 32));
src_y += 16;
@ -326,28 +340,31 @@ void YUVTORGB24Row_MSA(const uint8* src_y, const uint8* src_u,
}
// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
void YUVTORGB565Row_MSA(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_rgb565,
const struct YuvConstants* yuvconstants, int width) {
void YUVTORGB565Row_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
int x;
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug,
vec_vg, vec_bb, vec_bg, vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec0, vec2, vec1);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg, vec0, vec2, vec1);
vec0 = __msa_srai_h(vec0, 3);
vec1 = __msa_srai_h(vec1, 3);
vec2 = __msa_srai_h(vec2, 2);
vec1 = __msa_slli_h(vec1, 11);
vec2 = __msa_slli_h(vec2, 5);
vec0 |= vec1;
dst0 = (v16u8) (vec2 | vec0);
dst0 = (v16u8)(vec2 | vec0);
ST_UB(dst0, dst_rgb565);
src_y += 8;
src_u += 4;
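The shift/or sequence above is plain RGB565 packing: each 8-bit channel is truncated to 5/6/5 bits and positioned as R in bits 15..11, G in bits 10..5, and B in bits 4..0. As one scalar expression:

static uint16 PackRGB565(uint8 r, uint8 g, uint8 b) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
// e.g. PackRGB565(255, 255, 255) == 0xFFFF and PackRGB565(255, 0, 0) == 0xF800.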
@ -357,31 +374,34 @@ void YUVTORGB565Row_MSA(const uint8* src_y, const uint8* src_u,
}
// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
void I422ToARGB4444Row_MSA(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_argb4444,
const struct YuvConstants* yuvconstants, int width) {
void I422ToARGB4444Row_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
int x;
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v8u16 const_0xF000 = (v8u16) __msa_fill_h(0xF000);
v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug,
vec_vg, vec_bb, vec_bg, vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
reg0 = (v8u16) __msa_srai_h(vec0, 4);
reg1 = (v8u16) __msa_srai_h(vec1, 4);
reg2 = (v8u16) __msa_srai_h(vec2, 4);
reg1 = (v8u16) __msa_slli_h((v8i16) reg1, 4);
reg2 = (v8u16) __msa_slli_h((v8i16) reg2, 8);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg, vec0, vec1, vec2);
reg0 = (v8u16)__msa_srai_h(vec0, 4);
reg1 = (v8u16)__msa_srai_h(vec1, 4);
reg2 = (v8u16)__msa_srai_h(vec2, 4);
reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
reg1 |= const_0xF000;
reg0 |= reg2;
dst0 = (v16u8) (reg1 | reg0);
dst0 = (v16u8)(reg1 | reg0);
ST_UB(dst0, dst_argb4444);
src_y += 8;
src_u += 4;
@ -390,31 +410,34 @@ void I422ToARGB4444Row_MSA(const uint8* src_y, const uint8* src_u,
}
}
void I422ToARGB1555Row_MSA(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_argb1555,
const struct YuvConstants* yuvconstants, int width) {
void I422ToARGB1555Row_MSA(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
int x;
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v8u16 const_0x8000 = (v8u16) __msa_fill_h(0x8000);
v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug,
vec_vg, vec_bb, vec_bg, vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
reg0 = (v8u16) __msa_srai_h(vec0, 3);
reg1 = (v8u16) __msa_srai_h(vec1, 3);
reg2 = (v8u16) __msa_srai_h(vec2, 3);
reg1 = (v8u16) __msa_slli_h((v8i16) reg1, 5);
reg2 = (v8u16) __msa_slli_h((v8i16) reg2, 10);
YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg, vec0, vec1, vec2);
reg0 = (v8u16)__msa_srai_h(vec0, 3);
reg1 = (v8u16)__msa_srai_h(vec1, 3);
reg2 = (v8u16)__msa_srai_h(vec2, 3);
reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
reg1 |= const_0x8000;
reg0 |= reg2;
dst0 = (v16u8) (reg1 | reg0);
dst0 = (v16u8)(reg1 | reg0);
ST_UB(dst0, dst_argb1555);
src_y += 8;
src_u += 4;
@ -429,16 +452,19 @@ void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
for (x = 0; x < width; x += 32) {
LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
dst1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst_y, 16);
src_yuy2 += 64;
dst_y += 32;
}
}
void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
void YUY2ToUVRow_MSA(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
int x;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
@ -447,14 +473,14 @@ void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2,
for (x = 0; x < width; x += 32) {
LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
src0 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
src1 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
src2 = (v16u8) __msa_pckod_b((v16i8) src5, (v16i8) src4);
src3 = (v16u8) __msa_pckod_b((v16i8) src7, (v16i8) src6);
src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
vec0 = __msa_aver_u_b(src0, src2);
vec1 = __msa_aver_u_b(src1, src3);
dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_u);
ST_UB(dst1, dst_v);
src_yuy2 += 64;
@ -464,17 +490,19 @@ void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2,
}
}
void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
uint8* dst_u,
uint8* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 32) {
LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
src0 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
src1 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
ST_UB(dst0, dst_u);
ST_UB(dst1, dst_v);
src_yuy2 += 64;
@ -489,17 +517,20 @@ void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
for (x = 0; x < width; x += 32) {
LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
dst0 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
dst1 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst_y, 16);
src_uyvy += 64;
dst_y += 32;
}
}
void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int width) {
const uint8 *src_uyvy_next = src_uyvy + src_stride_uyvy;
void UYVYToUVRow_MSA(const uint8* src_uyvy,
int src_stride_uyvy,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy;
int x;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, dst0, dst1;
@ -507,14 +538,14 @@ void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy,
for (x = 0; x < width; x += 32) {
LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
src0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
src1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
src2 = (v16u8) __msa_pckev_b((v16i8) src5, (v16i8) src4);
src3 = (v16u8) __msa_pckev_b((v16i8) src7, (v16i8) src6);
src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
vec0 = __msa_aver_u_b(src0, src2);
vec1 = __msa_aver_u_b(src1, src3);
dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_u);
ST_UB(dst1, dst_v);
src_uyvy += 64;
@ -524,17 +555,19 @@ void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy,
}
}
void UYVYToUV422Row_MSA(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
void UYVYToUV422Row_MSA(const uint8* src_uyvy,
uint8* dst_u,
uint8* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 32) {
LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
src0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
src1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
ST_UB(dst0, dst_u);
ST_UB(dst1, dst_v);
src_uyvy += 64;
@ -547,27 +580,27 @@ void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
v16i8 zero = { 0 };
v8u16 const_0x19 = (v8u16) __msa_ldi_h(0x19);
v8u16 const_0x81 = (v8u16) __msa_ldi_h(0x81);
v8u16 const_0x42 = (v8u16) __msa_ldi_h(0x42);
v8u16 const_0x1080 = (v8u16) __msa_fill_h(0x1080);
v16i8 zero = {0};
v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
src0 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 0);
src1 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 16);
src2 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 32);
src3 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 48);
vec0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
vec1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
vec2 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
vec3 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
reg0 = (v8u16) __msa_ilvev_b(zero, (v16i8) vec0);
reg1 = (v8u16) __msa_ilvev_b(zero, (v16i8) vec1);
reg2 = (v8u16) __msa_ilvev_b(zero, (v16i8) vec2);
reg3 = (v8u16) __msa_ilvev_b(zero, (v16i8) vec3);
reg4 = (v8u16) __msa_ilvod_b(zero, (v16i8) vec0);
reg5 = (v8u16) __msa_ilvod_b(zero, (v16i8) vec1);
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
reg0 *= const_0x19;
reg1 *= const_0x19;
reg2 *= const_0x81;
@ -580,93 +613,96 @@ void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
reg1 += reg5;
reg0 += const_0x1080;
reg1 += const_0x1080;
reg0 = (v8u16) __msa_srai_h((v8i16) reg0, 8);
reg1 = (v8u16) __msa_srai_h((v8i16) reg1, 8);
dst0 = (v16u8) __msa_pckev_b((v16i8) reg1, (v16i8) reg0);
reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
ST_UB(dst0, dst_y);
src_argb0 += 64;
dst_y += 16;
}
}
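The multipliers above are fixed-point BT.601 luma weights: 0x19 = 25 for B, 0x81 = 129 for G, 0x42 = 66 for R; 0x1080 folds in the +16 offset plus a rounding half ahead of the final >> 8. Scalar sketch of the per-pixel math (illustrative):

static uint8 ARGBToY_sketch(uint8 b, uint8 g, uint8 r) {
  return (uint8)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}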
void ARGBToUVRow_MSA(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
void ARGBToUVRow_MSA(const uint8* src_argb0,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width) {
int x;
const uint8* src_argb0_next = src_argb0 + src_stride_argb;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
v16u8 dst0, dst1;
v8u16 const_0x70 = (v8u16) __msa_ldi_h(0x70);
v8u16 const_0x4A = (v8u16) __msa_ldi_h(0x4A);
v8u16 const_0x26 = (v8u16) __msa_ldi_h(0x26);
v8u16 const_0x5E = (v8u16) __msa_ldi_h(0x5E);
v8u16 const_0x12 = (v8u16) __msa_ldi_h(0x12);
v8u16 const_0x8080 = (v8u16) __msa_fill_h(0x8080);
v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 32) {
src0 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 0);
src1 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 16);
src2 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 32);
src3 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 48);
src4 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 64);
src5 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 80);
src6 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 96);
src7 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 112);
vec0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
vec1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
vec2 = (v16u8) __msa_pckev_b((v16i8) src5, (v16i8) src4);
vec3 = (v16u8) __msa_pckev_b((v16i8) src7, (v16i8) src6);
vec4 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
vec5 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
vec6 = (v16u8) __msa_pckod_b((v16i8) src5, (v16i8) src4);
vec7 = (v16u8) __msa_pckod_b((v16i8) src7, (v16i8) src6);
vec8 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
vec9 = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
vec4 = (v16u8) __msa_pckev_b((v16i8) vec5, (v16i8) vec4);
vec5 = (v16u8) __msa_pckev_b((v16i8) vec7, (v16i8) vec6);
vec0 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
vec1 = (v16u8) __msa_pckod_b((v16i8) vec3, (v16i8) vec2);
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
reg0 = __msa_hadd_u_h(vec8, vec8);
reg1 = __msa_hadd_u_h(vec9, vec9);
reg2 = __msa_hadd_u_h(vec4, vec4);
reg3 = __msa_hadd_u_h(vec5, vec5);
reg4 = __msa_hadd_u_h(vec0, vec0);
reg5 = __msa_hadd_u_h(vec1, vec1);
src0 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 0);
src1 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 16);
src2 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 32);
src3 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 48);
src4 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 64);
src5 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 80);
src6 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 96);
src7 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 112);
vec0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
vec1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
vec2 = (v16u8) __msa_pckev_b((v16i8) src5, (v16i8) src4);
vec3 = (v16u8) __msa_pckev_b((v16i8) src7, (v16i8) src6);
vec4 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
vec5 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
vec6 = (v16u8) __msa_pckod_b((v16i8) src5, (v16i8) src4);
vec7 = (v16u8) __msa_pckod_b((v16i8) src7, (v16i8) src6);
vec8 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
vec9 = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
vec4 = (v16u8) __msa_pckev_b((v16i8) vec5, (v16i8) vec4);
vec5 = (v16u8) __msa_pckev_b((v16i8) vec7, (v16i8) vec6);
vec0 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
vec1 = (v16u8) __msa_pckod_b((v16i8) vec3, (v16i8) vec2);
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
reg0 += __msa_hadd_u_h(vec8, vec8);
reg1 += __msa_hadd_u_h(vec9, vec9);
reg2 += __msa_hadd_u_h(vec4, vec4);
reg3 += __msa_hadd_u_h(vec5, vec5);
reg4 += __msa_hadd_u_h(vec0, vec0);
reg5 += __msa_hadd_u_h(vec1, vec1);
reg0 = (v8u16) __msa_srai_h((v8i16) reg0, 2);
reg1 = (v8u16) __msa_srai_h((v8i16) reg1, 2);
reg2 = (v8u16) __msa_srai_h((v8i16) reg2, 2);
reg3 = (v8u16) __msa_srai_h((v8i16) reg3, 2);
reg4 = (v8u16) __msa_srai_h((v8i16) reg4, 2);
reg5 = (v8u16) __msa_srai_h((v8i16) reg5, 2);
reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
reg6 = reg0 * const_0x70;
reg7 = reg1 * const_0x70;
reg8 = reg2 * const_0x4A;
@ -689,12 +725,12 @@ void ARGBToUVRow_MSA(const uint8* src_argb0, int src_stride_argb,
reg7 -= reg9;
reg4 -= reg2;
reg5 -= reg3;
reg6 = (v8u16) __msa_srai_h((v8i16) reg6, 8);
reg7 = (v8u16) __msa_srai_h((v8i16) reg7, 8);
reg4 = (v8u16) __msa_srai_h((v8i16) reg4, 8);
reg5 = (v8u16) __msa_srai_h((v8i16) reg5, 8);
dst0 = (v16u8) __msa_pckev_b((v16i8) reg7, (v16i8) reg6);
dst1 = (v16u8) __msa_pckev_b((v16i8) reg5, (v16i8) reg4);
reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
ST_UB(dst0, dst_u);
ST_UB(dst1, dst_v);
src_argb0 += 128;
@ -704,7 +740,8 @@ void ARGBToUVRow_MSA(const uint8* src_argb0, int src_stride_argb,
}
}
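The hadd_u_h sums plus the srai #2 average each 2x2 pixel block before the chroma weights (0x70 = 112, 0x4A = 74, 0x26 = 38, 0x5E = 94, 0x12 = 18) and the 0x8080 bias are applied. Scalar sketch for one U/V output, where ab/ag/ar are the 2x2-averaged B/G/R values (illustrative):

uint8 u = (uint8)((112 * ab - 74 * ag - 38 * ar + 0x8080) >> 8);
uint8 v = (uint8)((112 * ar - 94 * ag - 18 * ab + 0x8080) >> 8);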
void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, uint8* dst_argb,
void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
uint8* dst_argb,
int width) {
int x;
v16u8 src0, src1;
@ -712,20 +749,20 @@ void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, uint8* dst_argb,
v16u8 dst0, dst1, dst2, dst3;
for (x = 0; x < width; x += 16) {
src0 = (v16u8) __msa_ld_b((v16u8*) src_argb4444, 0);
src1 = (v16u8) __msa_ld_b((v16u8*) src_argb4444, 16);
vec0 = (v8u16) __msa_andi_b(src0, 0x0F);
vec1 = (v8u16) __msa_andi_b(src1, 0x0F);
vec2 = (v8u16) __msa_andi_b(src0, 0xF0);
vec3 = (v8u16) __msa_andi_b(src1, 0xF0);
vec0 |= (v8u16) __msa_slli_b((v16i8) vec0, 4);
vec1 |= (v8u16) __msa_slli_b((v16i8) vec1, 4);
vec2 |= (v8u16) __msa_srli_b((v16i8) vec2, 4);
vec3 |= (v8u16) __msa_srli_b((v16i8) vec3, 4);
dst0 = (v16u8) __msa_ilvr_b((v16i8) vec2, (v16i8) vec0);
dst1 = (v16u8) __msa_ilvl_b((v16i8) vec2, (v16i8) vec0);
dst2 = (v16u8) __msa_ilvr_b((v16i8) vec3, (v16i8) vec1);
dst3 = (v16u8) __msa_ilvl_b((v16i8) vec3, (v16i8) vec1);
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
src_argb4444 += 32;
dst_argb += 64;
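The andi/slli/srli steps expand each 4-bit channel by replicating its nibble (n becomes n * 17, so 0xF maps to 0xFF). Scalar sketch for one ARGB4444 pixel, with pix as the little-endian 16-bit value (illustrative):

uint8 b = (uint8)(pix & 0x0F);         b |= (uint8)(b << 4);
uint8 g = (uint8)((pix >> 4) & 0x0F);  g |= (uint8)(g << 4);
uint8 r = (uint8)((pix >> 8) & 0x0F);  r |= (uint8)(r << 4);
uint8 a = (uint8)(pix >> 12);          a |= (uint8)(a << 4);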


@ -22,18 +22,18 @@ extern "C" {
!defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
#define READYUV422 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.32 {d2[0]}, [%1]! \n" \
MEMACCESS(2) \
"vld1.32 {d2[1]}, [%2]! \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
#define READYUV444 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
MEMACCESS(2) \
@ -42,15 +42,15 @@ extern "C" {
"vrshrn.u16 d2, q1, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n"
#define READYUV400 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
#define READNV12 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
@ -58,9 +58,9 @@ extern "C" {
"vtrn.u32 d2, d3 \n"
// Read 8 Y and 4 VU from NV21
#define READNV21 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
#define READNV21 \
MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
@ -68,25 +68,25 @@ extern "C" {
"vtrn.u32 d2, d3 \n"
// Read 8 YUY2
#define READYUY2 \
MEMACCESS(0) \
"vld2.8 {d0, d2}, [%0]! \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
#define READYUY2 \
MEMACCESS(0) \
"vld2.8 {d0, d2}, [%0]! \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
// Read 8 UYVY
#define READUYVY \
MEMACCESS(0) \
"vld2.8 {d2, d3}, [%0]! \n" \
"vmov.u8 d0, d3 \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
#define READUYVY \
MEMACCESS(0) \
"vld2.8 {d2, d3}, [%0]! \n" \
"vmov.u8 d0, d3 \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
#define YUVTORGB_SETUP \
MEMACCESS([kUVToRB]) \
"vld1.8 {d24}, [%[kUVToRB]] \n" \
#define YUVTORGB_SETUP \
MEMACCESS([kUVToRB]) \
"vld1.8 {d24}, [%[kUVToRB]] \n" \
MEMACCESS([kUVToG]) \
"vld1.8 {d25}, [%[kUVToG]] \n" \
MEMACCESS([kUVBiasBGR]) \
@ -98,32 +98,32 @@ extern "C" {
MEMACCESS([kYToRgb]) \
"vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
#define YUVTORGB \
"vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\
"vmull.u8 q9, d2, d25 \n" /* u/v G component */\
"vmovl.u8 q0, d0 \n" /* Y */\
"vmovl.s16 q10, d1 \n" \
"vmovl.s16 q0, d0 \n" \
"vmul.s32 q10, q10, q15 \n" \
"vmul.s32 q0, q0, q15 \n" \
"vqshrun.s32 d0, q0, #16 \n" \
"vqshrun.s32 d1, q10, #16 \n" /* Y */\
"vadd.s16 d18, d19 \n" \
"vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\
"vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\
"vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\
"vaddw.u16 q1, q1, d16 \n" \
"vaddw.u16 q10, q10, d17 \n" \
"vaddw.u16 q3, q3, d18 \n" \
"vqadd.s16 q8, q0, q13 \n" /* B */ \
"vqadd.s16 q9, q0, q14 \n" /* R */ \
"vqadd.s16 q0, q0, q4 \n" /* G */ \
"vqadd.s16 q8, q8, q1 \n" /* B */ \
"vqadd.s16 q9, q9, q10 \n" /* R */ \
"vqsub.s16 q0, q0, q3 \n" /* G */ \
"vqshrun.s16 d20, q8, #6 \n" /* B */ \
"vqshrun.s16 d22, q9, #6 \n" /* R */ \
"vqshrun.s16 d21, q0, #6 \n" /* G */
#define YUVTORGB \
"vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
"vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
"vmovl.u8 q0, d0 \n" /* Y */ \
"vmovl.s16 q10, d1 \n" \
"vmovl.s16 q0, d0 \n" \
"vmul.s32 q10, q10, q15 \n" \
"vmul.s32 q0, q0, q15 \n" \
"vqshrun.s32 d0, q0, #16 \n" \
"vqshrun.s32 d1, q10, #16 \n" /* Y */ \
"vadd.s16 d18, d19 \n" \
"vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
"vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
"vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \
"vaddw.u16 q1, q1, d16 \n" \
"vaddw.u16 q10, q10, d17 \n" \
"vaddw.u16 q3, q3, d18 \n" \
"vqadd.s16 q8, q0, q13 \n" /* B */ \
"vqadd.s16 q9, q0, q14 \n" /* R */ \
"vqadd.s16 q0, q0, q4 \n" /* G */ \
"vqadd.s16 q8, q8, q1 \n" /* B */ \
"vqadd.s16 q9, q9, q10 \n" /* R */ \
"vqsub.s16 q0, q0, q3 \n" /* G */ \
"vqshrun.s16 d20, q8, #6 \n" /* B */ \
"vqshrun.s16 d22, q9, #6 \n" /* R */ \
"vqshrun.s16 d21, q0, #6 \n" /* G */
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
@ -277,12 +277,12 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
);
}
#define ARGBTORGB565 \
"vshll.u8 q0, d22, #8 \n" /* R */ \
"vshll.u8 q8, d21, #8 \n" /* G */ \
"vshll.u8 q9, d20, #8 \n" /* B */ \
"vsri.16 q0, q8, #5 \n" /* RG */ \
"vsri.16 q0, q9, #11 \n" /* RGB */
#define ARGBTORGB565 \
"vshll.u8 q0, d22, #8 \n" /* R */ \
"vshll.u8 q8, d21, #8 \n" /* G */ \
"vshll.u8 q9, d20, #8 \n" /* B */ \
"vsri.16 q0, q8, #5 \n" /* RG */ \
"vsri.16 q0, q9, #11 \n" /* RGB */
void I422ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_u,
@ -314,14 +314,14 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
);
}
#define ARGBTOARGB1555 \
"vshll.u8 q0, d23, #8 \n" /* A */ \
"vshll.u8 q8, d22, #8 \n" /* R */ \
"vshll.u8 q9, d21, #8 \n" /* G */ \
"vshll.u8 q10, d20, #8 \n" /* B */ \
"vsri.16 q0, q8, #1 \n" /* AR */ \
"vsri.16 q0, q9, #6 \n" /* ARG */ \
"vsri.16 q0, q10, #11 \n" /* ARGB */
#define ARGBTOARGB1555 \
"vshll.u8 q0, d23, #8 \n" /* A */ \
"vshll.u8 q8, d22, #8 \n" /* R */ \
"vshll.u8 q9, d21, #8 \n" /* G */ \
"vshll.u8 q10, d20, #8 \n" /* B */ \
"vsri.16 q0, q8, #1 \n" /* AR */ \
"vsri.16 q0, q9, #6 \n" /* ARG */ \
"vsri.16 q0, q10, #11 \n" /* ARGB */
void I422ToARGB1555Row_NEON(const uint8* src_y,
const uint8* src_u,
@ -354,14 +354,14 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
);
}
#define ARGBTOARGB4444 \
"vshr.u8 d20, d20, #4 \n" /* B */ \
"vbic.32 d21, d21, d4 \n" /* G */ \
"vshr.u8 d22, d22, #4 \n" /* R */ \
"vbic.32 d23, d23, d4 \n" /* A */ \
"vorr d0, d20, d21 \n" /* BG */ \
"vorr d1, d22, d23 \n" /* RA */ \
"vzip.u8 d0, d1 \n" /* BGRA */
#define ARGBTOARGB4444 \
"vshr.u8 d20, d20, #4 \n" /* B */ \
"vbic.32 d21, d21, d4 \n" /* G */ \
"vshr.u8 d22, d22, #4 \n" /* R */ \
"vbic.32 d23, d23, d4 \n" /* A */ \
"vorr d0, d20, d21 \n" /* BG */ \
"vorr d1, d22, d23 \n" /* RA */ \
"vzip.u8 d0, d1 \n" /* BGRA */
void I422ToARGB4444Row_NEON(const uint8* src_y,
const uint8* src_u,
@ -395,9 +395,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
);
}
void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile (
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
@ -420,9 +418,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
);
}
void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile (
"vmov.u8 d23, #255 \n"
"1: \n"
@ -579,7 +575,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
}
// Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void SplitUVRow_NEON(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@ -601,7 +599,9 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
void MergeUVRow_NEON(const uint8* src_u,
const uint8* src_v,
uint8* dst_uv,
int width) {
asm volatile (
"1: \n"
@ -698,7 +698,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
);
}
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void MirrorUVRow_NEON(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
// Start at end of source row.
@ -805,17 +807,17 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
);
}
#define RGB565TOARGB \
"vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
"vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
"vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
"vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
"vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
"vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
"vorr.u8 d0, d0, d4 \n" /* B */ \
"vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
"vorr.u8 d2, d1, d5 \n" /* R */ \
"vorr.u8 d1, d4, d6 \n" /* G */
#define RGB565TOARGB \
"vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
"vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
"vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
"vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
"vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
"vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
"vorr.u8 d0, d0, d4 \n" /* B */ \
"vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
"vorr.u8 d2, d1, d5 \n" /* R */ \
"vorr.u8 d1, d4, d6 \n" /* G */
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile (
@ -836,34 +838,35 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
);
}
#define ARGB1555TOARGB \
"vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
"vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
"vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
"vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
"vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
"vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
"vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
"vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
"vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
"vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
"vorr.u8 q1, q1, q3 \n" /* R,A */ \
"vorr.u8 q0, q0, q2 \n" /* B,G */ \
#define ARGB1555TOARGB \
"vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
"vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
"vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
"vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
"vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
"vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
"vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
"vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
"vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
"vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
"vorr.u8 q1, q1, q3 \n" /* R,A */ \
"vorr.u8 q0, q0, q2 \n" /* B,G */
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
#define RGB555TOARGB \
"vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
"vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
"vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
"vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
"vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
"vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
"vorr.u8 d0, d0, d4 \n" /* B */ \
"vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
"vorr.u8 d2, d1, d5 \n" /* R */ \
"vorr.u8 d1, d4, d6 \n" /* G */
#define RGB555TOARGB \
"vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
"vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
"vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
"vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
"vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
"vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
"vorr.u8 d0, d0, d4 \n" /* B */ \
"vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
"vorr.u8 d2, d1, d5 \n" /* R */ \
"vorr.u8 d1, d4, d6 \n" /* G */
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
uint8* dst_argb,
int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
@ -883,17 +886,18 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
);
}
#define ARGB4444TOARGB \
"vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
"vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
"vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
"vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
"vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
"vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
"vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
"vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
#define ARGB4444TOARGB \
"vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
"vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
"vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
"vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
"vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
"vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
"vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
"vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
uint8* dst_argb,
int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
@ -982,7 +986,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
);
}
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@ -1003,7 +1009,9 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
);
}
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@ -1024,8 +1032,11 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
);
}
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
void YUY2ToUVRow_NEON(const uint8* src_yuy2,
int stride_yuy2,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
@ -1051,8 +1062,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
);
}
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int width) {
void UYVYToUVRow_NEON(const uint8* src_uyvy,
int stride_uyvy,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
@ -1079,8 +1093,10 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int width) {
void ARGBShuffleRow_NEON(const uint8* src_argb,
uint8* dst_argb,
const uint8* shuffler,
int width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // shuffler
@ -1104,7 +1120,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_yuy2, int width) {
uint8* dst_yuy2,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -1130,7 +1147,8 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_uyvy, int width) {
uint8* dst_uyvy,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -1171,8 +1189,10 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
);
}
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
uint8* dst_rgb,
const uint32 dither4,
int width) {
asm volatile (
"vdup.32 d2, %2 \n" // dither4
"1: \n"
@ -1194,7 +1214,8 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
);
}
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
void ARGBToARGB1555Row_NEON(const uint8* src_argb,
uint8* dst_argb1555,
int width) {
asm volatile (
"1: \n"
@ -1213,7 +1234,8 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
);
}
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
void ARGBToARGB4444Row_NEON(const uint8* src_argb,
uint8* dst_argb4444,
int width) {
asm volatile (
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
@ -1302,7 +1324,9 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
}
// 8x1 pixels.
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV444Row_NEON(const uint8* src_argb,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
@ -1343,21 +1367,30 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
"vmul.s16 q8, " #QB ", q10 \n" /* B */ \
"vmls.s16 q8, " #QG ", q11 \n" /* G */ \
"vmls.s16 q8, " #QR ", q12 \n" /* R */ \
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
"vmul.s16 q9, " #QR ", q10 \n" /* R */ \
"vmls.s16 q9, " #QG ", q14 \n" /* G */ \
"vmls.s16 q9, " #QB ", q13 \n" /* B */ \
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
#define RGBTOUV(QB, QG, QR) \
"vmul.s16 q8, " #QB \
", q10 \n" /* B */ \
"vmls.s16 q8, " #QG \
", q11 \n" /* G */ \
"vmls.s16 q8, " #QR \
", q12 \n" /* R */ \
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
"vmul.s16 q9, " #QR \
", q10 \n" /* R */ \
"vmls.s16 q9, " #QG \
", q14 \n" /* G */ \
"vmls.s16 q9, " #QB \
", q13 \n" /* B */ \
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
void ARGBToUVRow_NEON(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@ -1405,8 +1438,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
}
// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
void ARGBToUVJRow_NEON(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
@ -1453,8 +1489,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
);
}
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width) {
void BGRAToUVRow_NEON(const uint8* src_bgra,
int src_stride_bgra,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_bgra
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@ -1501,8 +1540,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
);
}
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width) {
void ABGRToUVRow_NEON(const uint8* src_abgr,
int src_stride_abgr,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_abgr
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@ -1549,8 +1591,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
);
}
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width) {
void RGBAToUVRow_NEON(const uint8* src_rgba,
int src_stride_rgba,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgba
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@ -1597,8 +1642,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
);
}
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int width) {
void RGB24ToUVRow_NEON(const uint8* src_rgb24,
int src_stride_rgb24,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgb24
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@ -1645,8 +1693,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
);
}
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int width) {
void RAWToUVRow_NEON(const uint8* src_raw,
int src_stride_raw,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_raw
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@ -1694,8 +1745,11 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int width) {
void RGB565ToUVRow_NEON(const uint8* src_rgb565,
int src_stride_rgb565,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@ -1763,8 +1817,11 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int width) {
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
int src_stride_argb1555,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@ -1832,8 +1889,11 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int width) {
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
int src_stride_argb4444,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@ -2113,8 +2173,10 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
const uint8* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
asm volatile (
"cmp %4, #0 \n"
@ -2178,8 +2240,10 @@ void InterpolateRow_NEON(uint8* dst_ptr,
}
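A scalar sketch of the two-row blend this kernel performs, taking source_y_fraction in [0, 256) with 0 meaning the top row only (illustrative):

int f = source_y_fraction;
dst_ptr[x] = (uint8)((src_ptr[x] * (256 - f) + src_ptr[x + src_stride] * f) >> 8);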
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
void ARGBBlendRow_NEON(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
asm volatile (
"subs %3, #8 \n"
"blt 89f \n"
@ -2269,8 +2333,11 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
void ARGBQuantizeRow_NEON(uint8* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width) {
asm volatile (
"vdup.u16 q8, %2 \n"
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
@ -2312,7 +2379,9 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
void ARGBShadeRow_NEON(const uint8* src_argb,
uint8* dst_argb,
int width,
uint32 value) {
asm volatile (
"vdup.u32 q0, %3 \n" // duplicate scale value.
@ -2421,8 +2490,10 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
void ARGBColorMatrixRow_NEON(const uint8* src_argb,
uint8* dst_argb,
const int8* matrix_argb,
int width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
@ -2482,8 +2553,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
}
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
void ARGBMultiplyRow_NEON(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@ -2514,8 +2587,10 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
void ARGBAddRow_NEON(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@ -2540,8 +2615,10 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
void ARGBSubtractRow_NEON(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@ -2570,8 +2647,10 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
void SobelRow_NEON(const uint8* src_sobelx,
const uint8* src_sobely,
uint8* dst_argb,
int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
@ -2597,8 +2676,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
}
// Adds Sobel X and Sobel Y and stores Sobel into plane.
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width) {
void SobelToPlaneRow_NEON(const uint8* src_sobelx,
const uint8* src_sobely,
uint8* dst_y,
int width) {
asm volatile (
// 16 pixel loop.
"1: \n"
@ -2625,8 +2706,10 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
void SobelXYRow_NEON(const uint8* src_sobelx,
const uint8* src_sobely,
uint8* dst_argb,
int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
@ -2653,8 +2736,11 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// -1 0 1
// -2 0 2
// -1 0 1
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) {
void SobelXRow_NEON(const uint8* src_y0,
const uint8* src_y1,
const uint8* src_y2,
uint8* dst_sobelx,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
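A scalar sketch of the Sobel X tap the kernel comment describes, with clamp255() as a stand-in saturating helper (illustrative; the loads themselves are elided by the hunk):

int a = src_y0[i] - src_y0[i + 2];
int b = src_y1[i] - src_y1[i + 2];
int c = src_y2[i] - src_y2[i + 2];
dst_sobelx[i] = clamp255(abs(a + 2 * b + c));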
@ -2696,8 +2782,10 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
// -1 -2 -1
// 0 0 0
// 1 2 1
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) {
void SobelYRow_NEON(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)


@ -19,18 +19,18 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
#define READYUV422 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
"ld1 {v1.s}[0], [%1], #4 \n" \
MEMACCESS(2) \
"ld1 {v1.s}[1], [%2], #4 \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
#define READYUV444 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
"ld1 {v1.d}[0], [%1], #8 \n" \
MEMACCESS(2) \
@ -39,15 +39,15 @@ extern "C" {
"rshrn v1.8b, v1.8h, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
"movi v1.8b , #128 \n"
#define READYUV400 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
"movi v1.8b , #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
#define READNV12 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
"ld1 {v2.8b}, [%1], #8 \n" \
"uzp1 v1.8b, v2.8b, v2.8b \n" \
@ -55,9 +55,9 @@ extern "C" {
"ins v1.s[1], v3.s[0] \n"
// Read 8 Y and 4 VU from NV21
#define READNV21 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
#define READNV21 \
MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
"ld1 {v2.8b}, [%1], #8 \n" \
"uzp1 v3.8b, v2.8b, v2.8b \n" \
@ -65,57 +65,65 @@ extern "C" {
"ins v1.s[1], v3.s[0] \n"
// Read 8 YUY2
#define READYUY2 \
MEMACCESS(0) \
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
"uzp2 v3.8b, v1.8b, v1.8b \n" \
"uzp1 v1.8b, v1.8b, v1.8b \n" \
"ins v1.s[1], v3.s[0] \n"
#define READYUY2 \
MEMACCESS(0) \
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
"uzp2 v3.8b, v1.8b, v1.8b \n" \
"uzp1 v1.8b, v1.8b, v1.8b \n" \
"ins v1.s[1], v3.s[0] \n"
// Read 8 UYVY
#define READUYVY \
MEMACCESS(0) \
"ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
"orr v0.8b, v3.8b, v3.8b \n" \
"uzp1 v1.8b, v2.8b, v2.8b \n" \
"uzp2 v3.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
#define READUYVY \
MEMACCESS(0) \
"ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
"orr v0.8b, v3.8b, v3.8b \n" \
"uzp1 v1.8b, v2.8b, v2.8b \n" \
"uzp2 v3.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
#define YUVTORGB_SETUP \
"ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
"ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
"ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
"ld1r {v31.4s}, [%[kYToRgb]] \n" \
"ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
"ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
#define YUVTORGB_SETUP \
"ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
"ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
"ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
"ld1r {v31.4s}, [%[kYToRgb]] \n" \
"ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
"ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
#define YUVTORGB(vR, vG, vB) \
"uxtl v0.8h, v0.8b \n" /* Extract Y */ \
"shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
"ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
"ushll v0.4s, v0.4h, #0 \n" \
"mul v3.4s, v3.4s, v31.4s \n" \
"mul v0.4s, v0.4s, v31.4s \n" \
"sqshrun v0.4h, v0.4s, #16 \n" \
"sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
"uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
"mov v2.d[0], v1.d[1] \n" /* Extract V */ \
"uxtl v2.8h, v2.8b \n" \
"uxtl v1.8h, v1.8b \n" /* Extract U */ \
"mul v3.8h, v1.8h, v27.8h \n" \
"mul v5.8h, v1.8h, v29.8h \n" \
"mul v6.8h, v2.8h, v30.8h \n" \
"mul v7.8h, v2.8h, v28.8h \n" \
"sqadd v6.8h, v6.8h, v5.8h \n" \
"sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
"sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
"sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
"sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
"sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
"sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
"sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
"sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
#define YUVTORGB(vR, vG, vB) \
"uxtl v0.8h, v0.8b \n" /* Extract Y */ \
"shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
"ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
"ushll v0.4s, v0.4h, #0 \n" \
"mul v3.4s, v3.4s, v31.4s \n" \
"mul v0.4s, v0.4s, v31.4s \n" \
"sqshrun v0.4h, v0.4s, #16 \n" \
"sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
"uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
"mov v2.d[0], v1.d[1] \n" /* Extract V */ \
"uxtl v2.8h, v2.8b \n" \
"uxtl v1.8h, v1.8b \n" /* Extract U */ \
"mul v3.8h, v1.8h, v27.8h \n" \
"mul v5.8h, v1.8h, v29.8h \n" \
"mul v6.8h, v2.8h, v30.8h \n" \
"mul v7.8h, v2.8h, v28.8h \n" \
"sqadd v6.8h, v6.8h, v5.8h \n" \
"sqadd " #vB \
".8h, v24.8h, v0.8h \n" /* B */ \
"sqadd " #vG \
".8h, v25.8h, v0.8h \n" /* G */ \
"sqadd " #vR \
".8h, v26.8h, v0.8h \n" /* R */ \
"sqadd " #vB ".8h, " #vB \
".8h, v3.8h \n" /* B */ \
"sqsub " #vG ".8h, " #vG \
".8h, v6.8h \n" /* G */ \
"sqadd " #vR ".8h, " #vR \
".8h, v7.8h \n" /* R */ \
"sqshrun " #vB ".8b, " #vB \
".8h, #6 \n" /* B */ \
"sqshrun " #vG ".8b, " #vG \
".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
@ -269,12 +277,12 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
);
}
#define ARGBTORGB565 \
"shll v0.8h, v22.8b, #8 \n" /* R */ \
"shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
#define ARGBTORGB565 \
"shll v0.8h, v22.8b, #8 \n" /* R */ \
"shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
void I422ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_u,
@ -306,14 +314,14 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
);
}
#define ARGBTOARGB1555 \
"shll v0.8h, v23.8b, #8 \n" /* A */ \
"shll v22.8h, v22.8b, #8 \n" /* R */ \
"shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
"sri v0.8h, v22.8h, #1 \n" /* AR */ \
"sri v0.8h, v21.8h, #6 \n" /* ARG */ \
"sri v0.8h, v20.8h, #11 \n" /* ARGB */
#define ARGBTOARGB1555 \
"shll v0.8h, v23.8b, #8 \n" /* A */ \
"shll v22.8h, v22.8b, #8 \n" /* R */ \
"shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
"sri v0.8h, v22.8h, #1 \n" /* AR */ \
"sri v0.8h, v21.8h, #6 \n" /* ARG */ \
"sri v0.8h, v20.8h, #11 \n" /* ARGB */
void I422ToARGB1555Row_NEON(const uint8* src_y,
const uint8* src_u,
@ -346,15 +354,15 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
);
}
#define ARGBTOARGB4444 \
/* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
"ushr v20.8b, v20.8b, #4 \n" /* B */ \
"bic v21.8b, v21.8b, v4.8b \n" /* G */ \
"ushr v22.8b, v22.8b, #4 \n" /* R */ \
"bic v23.8b, v23.8b, v4.8b \n" /* A */ \
"orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
"orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
#define ARGBTOARGB4444 \
/* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
"ushr v20.8b, v20.8b, #4 \n" /* B */ \
"bic v21.8b, v21.8b, v4.8b \n" /* G */ \
"ushr v22.8b, v22.8b, #4 \n" /* R */ \
"bic v23.8b, v23.8b, v4.8b \n" /* A */ \
"orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
"orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
void I422ToARGB4444Row_NEON(const uint8* src_y,
const uint8* src_u,
@ -388,9 +396,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
);
}
void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile (
YUVTORGB_SETUP
"movi v23.8b, #255 \n"
@ -413,9 +419,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
);
}
void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile (
"movi v23.8b, #255 \n"
"1: \n"
@ -572,7 +576,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
}
// Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void SplitUVRow_NEON(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@ -594,7 +600,9 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
void MergeUVRow_NEON(const uint8* src_u,
const uint8* src_v,
uint8* dst_uv,
int width) {
asm volatile (
"1: \n"
@ -688,7 +696,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
);
}
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void MirrorUVRow_NEON(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
// Start at end of source row.
@ -794,18 +804,18 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
);
}
#define RGB565TOARGB \
"shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
"shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
"ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
"orr v1.8b, v4.8b, v6.8b \n" /* G */ \
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
"xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
"shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
"ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
"orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
"dup v2.2D, v0.D[1] \n" /* R */
#define RGB565TOARGB \
"shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
"shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
"ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
"orr v1.8b, v4.8b, v6.8b \n" /* G */ \
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
"xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
"shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
"ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
"orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
"dup v2.2D, v0.D[1] \n" /* R */
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile (
@ -826,44 +836,45 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
);
}
#define ARGB1555TOARGB \
"ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
"shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
"xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
\
"sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
"xtn2 v3.16b, v2.8h \n" \
\
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
\
"ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
"shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
"ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
\
"orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
"orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
"dup v1.2D, v0.D[1] \n" \
"dup v3.2D, v2.D[1] \n"
#define ARGB1555TOARGB \
"ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
"shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
"xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
\
"sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
"xtn2 v3.16b, v2.8h \n" \
\
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
\
"ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
"shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
"ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
\
"orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
"orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
"dup v1.2D, v0.D[1] \n" \
"dup v3.2D, v2.D[1] \n"
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
#define RGB555TOARGB \
"ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
"shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
"xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
\
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
\
"ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
"shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
"ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
\
"orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
"orr v2.16b, v1.16b, v3.16b \n" /* R */ \
"dup v1.2D, v0.D[1] \n" /* G */ \
#define RGB555TOARGB \
"ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
"shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
"xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
\
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
"shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
\
"ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
"shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
"ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
\
"orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
"orr v2.16b, v1.16b, v3.16b \n" /* R */ \
"dup v1.2D, v0.D[1] \n" /* G */
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
uint8* dst_argb,
int width) {
asm volatile (
"movi v3.8b, #255 \n" // Alpha
@ -883,19 +894,20 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
);
}
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
"xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
"shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
"ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
"ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
"shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
"orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
"orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
"dup v0.2D, v2.D[1] \n" \
"dup v1.2D, v3.D[1] \n"
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
"xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
"shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
"ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
"ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
"shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
"orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
"orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
"dup v0.2D, v2.D[1] \n" \
"dup v1.2D, v3.D[1] \n"
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
uint8* dst_argb,
int width) {
asm volatile (
"1: \n"
@ -984,7 +996,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
);
}
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@ -1005,7 +1019,9 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
);
}
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@ -1026,8 +1042,11 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
);
}
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
void YUY2ToUVRow_NEON(const uint8* src_yuy2,
int stride_yuy2,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
"1: \n"
@ -1054,8 +1073,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
);
}
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int width) {
void UYVYToUVRow_NEON(const uint8* src_uyvy,
int stride_uyvy,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
"1: \n"
@ -1083,8 +1105,10 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int width) {
void ARGBShuffleRow_NEON(const uint8* src_argb,
uint8* dst_argb,
const uint8* shuffler,
int width) {
asm volatile (
MEMACCESS(3)
"ld1 {v2.16b}, [%3] \n" // shuffler
@ -1107,7 +1131,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_yuy2, int width) {
uint8* dst_yuy2,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -1134,7 +1159,8 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_uyvy, int width) {
uint8* dst_uyvy,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -1176,8 +1202,10 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
);
}
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
uint8* dst_rgb,
const uint32 dither4,
int width) {
asm volatile (
"dup v1.4s, %w2 \n" // dither4
"1: \n"
@ -1199,7 +1227,8 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
);
}
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
void ARGBToARGB1555Row_NEON(const uint8* src_argb,
uint8* dst_argb1555,
int width) {
asm volatile (
"1: \n"
@ -1218,7 +1247,8 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
);
}
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
void ARGBToARGB4444Row_NEON(const uint8* src_argb,
uint8* dst_argb4444,
int width) {
asm volatile (
"movi v4.16b, #0x0f \n" // bits to clear with vbic.
@ -1306,7 +1336,9 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
}
// 8x1 pixels.
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV444Row_NEON(const uint8* src_argb,
uint8* dst_u,
uint8* dst_v,
int width) {
asm volatile (
"movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
@ -1347,32 +1379,41 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
);
}
#define RGBTOUV_SETUP_REG \
"movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
"movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
"movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
"movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
"movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
#define RGBTOUV_SETUP_REG \
"movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
"movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
"movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
"movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
"movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB ",v20.8h \n" /* B */ \
"mul v4.8h, " #QR ",v20.8h \n" /* R */ \
"mls v3.8h, " #QG ",v21.8h \n" /* G */ \
"mls v4.8h, " #QG ",v24.8h \n" /* G */ \
"mls v3.8h, " #QR ",v22.8h \n" /* R */ \
"mls v4.8h, " #QB ",v23.8h \n" /* B */ \
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
#define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB \
",v20.8h \n" /* B */ \
"mul v4.8h, " #QR \
",v20.8h \n" /* R */ \
"mls v3.8h, " #QG \
",v21.8h \n" /* G */ \
"mls v4.8h, " #QG \
",v24.8h \n" /* G */ \
"mls v3.8h, " #QR \
",v22.8h \n" /* R */ \
"mls v4.8h, " #QB \
",v23.8h \n" /* B */ \
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
void ARGBToUVRow_NEON(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
@ -1412,8 +1453,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
}
// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8* src_argb,
int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
@ -1456,8 +1500,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
);
}
void BGRAToUVRow_NEON(const uint8* src_bgra,
int src_stride_bgra,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
@ -1495,8 +1542,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
);
}
void ABGRToUVRow_NEON(const uint8* src_abgr,
int src_stride_abgr,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
@ -1534,8 +1584,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
);
}
void RGBAToUVRow_NEON(const uint8* src_rgba,
int src_stride_rgba,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
@ -1573,8 +1626,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
);
}
void RGB24ToUVRow_NEON(const uint8* src_rgb24,
int src_stride_rgb24,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
@ -1612,8 +1668,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
);
}
void RAWToUVRow_NEON(const uint8* src_raw,
int src_stride_raw,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
@ -1652,8 +1711,11 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8* src_rgb565,
int src_stride_rgb565,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile (
"movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
@ -1726,8 +1788,11 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
int src_stride_argb1555,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile (
RGBTOUV_SETUP_REG
@ -1795,8 +1860,11 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
int src_stride_argb4444,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile (
RGBTOUV_SETUP_REG
@ -2078,8 +2146,10 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
@ -2144,8 +2214,10 @@ void InterpolateRow_NEON(uint8* dst_ptr,
}
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
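Restated as scalar C before the NEON version (an illustrative identity check; BlendChannel_C is a hypothetical helper, not part of this patch):

// out = dr * (256 - sa) / 256 + sr  ==  sr + dr - dr * sa / 256:
// attenuate the destination channel by the source alpha, then add the
// already-premultiplied source channel.
static __inline int BlendChannel_C(int sr, int dr, int sa) {
  return sr + dr - ((dr * sa) >> 8);
}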
void ARGBBlendRow_NEON(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
asm volatile (
"subs %w3, %w3, #8 \n"
"b.lt 89f \n"
@ -2240,8 +2312,11 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
void ARGBQuantizeRow_NEON(uint8* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width) {
asm volatile (
"dup v4.8h, %w2 \n"
"ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
@ -2283,7 +2358,9 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
void ARGBShadeRow_NEON(const uint8* src_argb,
uint8* dst_argb,
int width,
uint32 value) {
asm volatile (
"dup v0.4s, %w3 \n" // duplicate scale value.
@ -2393,8 +2470,10 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
void ARGBColorMatrixRow_NEON(const uint8* src_argb,
uint8* dst_argb,
const int8* matrix_argb,
int width) {
asm volatile (
MEMACCESS(3)
"ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
@ -2455,8 +2534,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@ -2487,8 +2568,10 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@ -2515,8 +2598,10 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_NEON(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@ -2547,8 +2632,10 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_NEON(const uint8* src_sobelx,
const uint8* src_sobely,
uint8* dst_argb,
int width) {
asm volatile (
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
@ -2574,8 +2661,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
}
// Adds Sobel X and Sobel Y and stores Sobel into plane.
void SobelToPlaneRow_NEON(const uint8* src_sobelx,
const uint8* src_sobely,
uint8* dst_y,
int width) {
asm volatile (
// 16 pixel loop.
"1: \n"
@ -2602,8 +2691,10 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_NEON(const uint8* src_sobelx,
const uint8* src_sobely,
uint8* dst_argb,
int width) {
asm volatile (
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
@ -2630,8 +2721,11 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// -1 0 1
// -2 0 2
// -1 0 1
void SobelXRow_NEON(const uint8* src_y0,
const uint8* src_y1,
const uint8* src_y2,
uint8* dst_sobelx,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -2673,8 +2767,10 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
// -1 -2 -1
// 0 0 0
// 1 2 1
void SobelYRow_NEON(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
int width) {
asm volatile (
"1: \n"
MEMACCESS(0)

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@ -19,16 +19,15 @@ extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, \
int dx) { \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, dst_width & MASK, x + n * dx, dx); \
}
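For illustration (a sketch, not text from the patch), the first NEON instantiation below expands to the following, with the SIMD path taking the multiple-of-8 prefix and the C fallback finishing the last dst_width & 7 columns:

// Expansion of CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON,
//                   ScaleFilterCols_C, 1, 7).
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
                              int dst_width, int x, int dx) {
  int n = dst_width & ~7;
  if (n > 0) {
    ScaleFilterCols_NEON(dst_ptr, src_ptr, n, x, dx);
  }
  ScaleFilterCols_C(dst_ptr + n * 1, src_ptr, dst_width & 7, x + n * dx, dx);
}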
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
@ -37,167 +36,302 @@ CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON,
ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C,
4,
3)
#endif
#undef CANY
// Fixed scale down.
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
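A quick worked split (illustrative numbers, not from the patch): with MASK 15, a dst_width of 100 gives r = 4 and n = 96, so the SIMD row function covers 96 output pixels and the C version finishes the last 4 starting at source offset n * FACTOR * BPP:

int dst_width = 100;
int r = (int)((unsigned int)dst_width % 16);  // 4 pixels left for C
int n = dst_width - r;                        // 96 pixels for SIMD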
// Fixed scale down for odd source width. Used by I420Blend subsampling.
// Since dst_width is (width + 1) / 2, this function scales one less pixel
// and copies the last pixel.
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
int dst_width) { \
int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEROWDOWN2_SSSE3
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSSE3,
ScaleRowDown2Linear_SSSE3,
ScaleRowDown2Linear_C,
2,
1,
15)
SDANY(ScaleRowDown2Box_Any_SSSE3,
ScaleRowDown2Box_SSSE3,
ScaleRowDown2Box_C,
2,
1,
15)
SDODD(ScaleRowDown2Box_Odd_SSSE3,
ScaleRowDown2Box_SSSE3,
ScaleRowDown2Box_Odd_C,
2,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,
ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C,
2,
1,
31)
SDANY(ScaleRowDown2Box_Any_AVX2,
ScaleRowDown2Box_AVX2,
ScaleRowDown2Box_C,
2,
1,
31)
SDODD(ScaleRowDown2Box_Odd_AVX2,
ScaleRowDown2Box_AVX2,
ScaleRowDown2Box_Odd_C,
2,
1,
31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_NEON,
ScaleRowDown2Linear_NEON,
ScaleRowDown2Linear_C,
2,
1,
15)
SDANY(ScaleRowDown2Box_Any_NEON,
ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C,
2,
1,
15)
SDODD(ScaleRowDown2Box_Odd_NEON,
ScaleRowDown2Box_NEON,
ScaleRowDown2Box_Odd_C,
2,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3,
ScaleRowDown4Box_SSSE3,
ScaleRowDown4Box_C,
4,
1,
7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_AVX2,
ScaleRowDown4Box_AVX2,
ScaleRowDown4Box_C,
4,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN4_NEON
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_NEON,
ScaleRowDown4Box_NEON,
ScaleRowDown4Box_C,
4,
1,
7)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3,
ScaleRowDown34_SSSE3,
ScaleRowDown34_C,
4 / 3,
1,
23)
SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
ScaleRowDown34_0_Box_SSSE3,
ScaleRowDown34_0_Box_C,
4 / 3,
1,
23)
SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
ScaleRowDown34_1_Box_SSSE3,
ScaleRowDown34_1_Box_C,
4 / 3,
1,
23)
#endif
#ifdef HAS_SCALEROWDOWN34_NEON
SDANY(ScaleRowDown34_Any_NEON,
ScaleRowDown34_NEON,
ScaleRowDown34_C,
4 / 3,
1,
23)
SDANY(ScaleRowDown34_0_Box_Any_NEON,
ScaleRowDown34_0_Box_NEON,
ScaleRowDown34_0_Box_C,
4 / 3,
1,
23)
SDANY(ScaleRowDown34_1_Box_Any_NEON,
ScaleRowDown34_1_Box_NEON,
ScaleRowDown34_1_Box_C,
4 / 3,
1,
23)
#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3,
ScaleRowDown38_SSSE3,
ScaleRowDown38_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
ScaleRowDown38_3_Box_SSSE3,
ScaleRowDown38_3_Box_C,
8 / 3,
1,
5)
SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
ScaleRowDown38_2_Box_SSSE3,
ScaleRowDown38_2_Box_C,
8 / 3,
1,
5)
#endif
#ifdef HAS_SCALEROWDOWN38_NEON
SDANY(ScaleRowDown38_Any_NEON,
ScaleRowDown38_NEON,
ScaleRowDown38_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_3_Box_Any_NEON,
ScaleRowDown38_3_Box_NEON,
ScaleRowDown38_3_Box_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_2_Box_Any_NEON,
ScaleRowDown38_2_Box_NEON,
ScaleRowDown38_2_Box_C,
8 / 3,
1,
11)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2,
ScaleARGBRowDown2_SSE2,
ScaleARGBRowDown2_C,
2,
4,
3)
SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
ScaleARGBRowDown2Linear_SSE2,
ScaleARGBRowDown2Linear_C,
2,
4,
3)
SDANY(ScaleARGBRowDown2Box_Any_SSE2,
ScaleARGBRowDown2Box_SSE2,
ScaleARGBRowDown2Box_C,
2,
4,
3)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_NEON
SDANY(ScaleARGBRowDown2_Any_NEON,
ScaleARGBRowDown2_NEON,
ScaleARGBRowDown2_C,
2,
4,
7)
SDANY(ScaleARGBRowDown2Linear_Any_NEON,
ScaleARGBRowDown2Linear_NEON,
ScaleARGBRowDown2Linear_C,
2,
4,
7)
SDANY(ScaleARGBRowDown2Box_Any_NEON,
ScaleARGBRowDown2Box_NEON,
ScaleARGBRowDown2Box_C,
2,
4,
7)
#endif
#undef SDANY
// Scale down by even scale factor.
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
SDAANY(ScaleARGBRowDownEven_Any_SSE2,
ScaleARGBRowDownEven_SSE2,
ScaleARGBRowDownEven_C,
4,
3)
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
ScaleARGBRowDownEvenBox_SSE2,
ScaleARGBRowDownEvenBox_C,
4,
3)
#endif
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
SDAANY(ScaleARGBRowDownEven_Any_NEON,
ScaleARGBRowDownEven_NEON,
ScaleARGBRowDownEven_C,
4,
3)
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
ScaleARGBRowDownEvenBox_NEON,
ScaleARGBRowDownEvenBox_C,
4,
3)
#endif
// Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
int n = src_width & ~MASK; \
if (n > 0) { \
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
} \
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
}
#ifdef HAS_SCALEADDROW_SSE2
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
@ -214,8 +348,3 @@ SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
} // extern "C"
} // namespace libyuv
#endif


@ -30,20 +30,28 @@ static __inline int Abs(int v) {
// ScaleARGB ARGB, 1/2
// This is an optimized version for scaling down an ARGB image to 1/2 of
// its original size.
static void ScaleARGBDown2(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) =
filtering == kFilterNone
? ScaleARGBRowDown2_C
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
: ScaleARGBRowDown2Box_C);
assert(dx == 65536 * 2); // Test scale factor of 2.
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row, even column.
if (filtering == kFilterBilinear) {
@ -54,25 +62,33 @@ static void ScaleARGBDown2(int src_width, int src_height,
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_Any_SSE2
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
: ScaleARGBRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_SSE2
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
: ScaleARGBRowDown2Box_SSE2);
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_Any_NEON
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
: ScaleARGBRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_NEON
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
: ScaleARGBRowDown2Box_NEON);
}
}
#endif
@ -90,21 +106,29 @@ static void ScaleARGBDown2(int src_width, int src_height,
// ScaleARGB ARGB, 1/4
// This is an optimized version for scaling down an ARGB image to 1/4 of
// its original size.
static void ScaleARGBDown4Box(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy) {
int j;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) =
ScaleARGBRowDown2Box_C;
// Advance to odd row, even column.
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@ -125,8 +149,8 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
dst_width * 2);
ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
src_argb += row_stride;
dst_argb += dst_stride;
@ -137,11 +161,18 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
// ScaleARGB ARGB Even
// This is an optimized version for scaling down an ARGB image to an even
// multiple of its original size.
static void ScaleARGBDownEven(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
int col_step = dx >> 16;
@ -154,21 +185,21 @@ static void ScaleARGBDownEven(int src_width, int src_height,
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
: ScaleARGBRowDownEven_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven =
filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
: ScaleARGBRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven =
filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
}
}
#endif
@ -184,25 +215,32 @@ static void ScaleARGBDownEven(int src_width, int src_height,
}
// Scale ARGB down with bilinear interpolation.
static void ScaleARGBBilinearDown(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
int64 xlast = x + (int64)(dst_width - 1) * dx;
int64 xl = (dx >= 0) ? x : xlast;
int64 xr = (dx >= 0) ? xlast : x;
int clip_src_width;
xl = (xl >> 16) & ~3; // Left edge aligned.
xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
xl = (xl >> 16) & ~3; // Left edge aligned.
xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
if (xr > src_width) {
xr = src_width;
@ -235,8 +273,8 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
@ -286,18 +324,25 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
// Scale ARGB up with bilinear interpolation.
static void ScaleARGBBilinearUp(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSSE3)
@ -325,14 +370,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
if (src_width >= 32768) {
ScaleARGBFilterCols =
filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
}
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@ -423,8 +468,10 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
#ifdef YUVSCALEUP
// Scale YUV to ARGB up with bilinear interpolation.
static void ScaleYUVToARGBBilinearUp(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride_y,
int src_stride_u,
int src_stride_v,
@ -433,14 +480,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf, int width) =
I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
@ -483,9 +531,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@ -511,18 +559,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
if (src_width >= 32768) {
ScaleARGBFilterCols =
filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
}
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@ -643,14 +691,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
// of x and dx is the integer part of the source position and
// the lower 16 bits are the fixed decimal part.
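A short 16.16 fixed-point walk-through (illustrative, not from the patch):

int x = 0x18000;   // 1.5: integer part (x >> 16) == 1, fraction 0x8000
int dx = 0x20000;  // step exactly 2.0 source pixels per destination pixel
int xi = x >> 16;  // sampled source column: 1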
static void ScaleARGBSimple(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy) {
int j;
void (*ScaleARGBCols)(uint8 * dst_argb, const uint8* src_argb, int dst_width,
int x, int dx) =
(src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
@ -675,8 +730,8 @@ static void ScaleARGBSimple(int src_width, int src_height,
}
for (j = 0; j < dst_height; ++j) {
ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
dst_width, x, dx);
ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
dx);
dst_argb += dst_stride;
y += dy;
}
@ -685,11 +740,18 @@ static void ScaleARGBSimple(int src_width, int src_height,
// Scale an ARGB image.
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
static void ScaleARGB(const uint8* src,
int src_stride,
int src_width,
int src_height,
uint8* dst,
int dst_stride,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@ -698,8 +760,7 @@ static void ScaleARGB(const uint8* src, int src_stride,
int dy = 0;
// ARGB does not support box filter yet, but allow the user to pass it.
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
// Negative src_height means invert the image.
@ -708,17 +769,17 @@ static void ScaleARGB(const uint8* src, int src_stride,
src = src + (src_height - 1) * src_stride;
src_stride = -src_stride;
}
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
&dx, &dy);
src_width = Abs(src_width);
if (clip_x) {
int64 clipf = (int64)(clip_x)*dx;
x += (clipf & 0xffff);
src += (clipf >> 16) * 4;
dst += clip_x * 4;
}
if (clip_y) {
int64 clipf = (int64)(clip_y)*dy;
y += (clipf & 0xffff);
src += (clipf >> 16) * src_stride;
dst += clip_y * dst_stride;
@ -733,24 +794,20 @@ static void ScaleARGB(const uint8* src, int src_stride,
if (!(dx & 0x10000) && !(dy & 0x10000)) {
if (dx == 0x20000) {
// Optimized 1/2 downsample.
ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
return;
}
if (dx == 0x40000 && filtering == kFilterBox) {
// Optimized 1/4 box downsample.
ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy);
return;
}
ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
return;
}
// Optimized odd scale down, i.e. 3, 5, 7, 9x.
@ -767,96 +824,103 @@ static void ScaleARGB(const uint8* src, int src_stride,
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
}
if (filtering && dy < 65536) {
ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
return;
}
if (filtering) {
ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
return;
}
ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst, x, dx, y, dy);
}
LIBYUV_API
int ARGBScaleClip(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
clip_width > 32768 || clip_height > 32768 ||
(clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) {
return -1;
}
ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
clip_height, filtering);
return 0;
}
// Scale an ARGB image.
LIBYUV_API
int ARGBScale(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}
ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
filtering);
return 0;
}
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint32 src_fourcc,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
uint32 dst_fourcc,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
int r;
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
argb_buffer, src_width * 4, src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
clip_width, clip_height, filtering);
free(argb_buffer);
return r;
}
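A hypothetical caller (sketch only; the buffers and dimensions are invented) scaling a 640x480 I420 frame to 320x240 ARGB with bilinear filtering. Note the fourcc arguments are not consulted by the body shown above:

// I420: Y is full resolution, U and V are quarter resolution (half stride).
uint8 y[640 * 480], u[320 * 240], v[320 * 240], argb[320 * 240 * 4];
int ret = YUVToARGBScaleClip(y, 640, u, 320, v, 320, 0, 640, 480, argb,
                             320 * 4, 0, 320, 240, 0, 0, 320, 240,
                             kFilterBilinear);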


@ -28,8 +28,10 @@ static __inline int Abs(int v) {
}
// CPU agnostic row functions
void ScaleRowDown2_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[1];
@ -42,8 +44,10 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[1];
@ -56,8 +60,10 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Linear_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
const uint8* s = src_ptr;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
@ -71,8 +77,10 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
const uint16* s = src_ptr;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
@ -86,8 +94,10 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@ -103,8 +113,10 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@ -125,8 +137,10 @@ void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
dst[0] = (s[0] + t[0] + 1) >> 1;
}
void ScaleRowDown2Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
@ -142,8 +156,10 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[2];
@ -156,8 +172,10 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[2];
@ -170,80 +188,86 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
src_ptr[stride * 3 + 7] + 8) >>
4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
}
}
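The + 8 before the >> 4 rounds the 16-tap box sum to nearest (a quick check, not from the patch):

// 16 samples of value 100 sum to 1600: (1600 + 8) >> 4 == 100 exactly;
// a sum of 1608 or more would round up to 101.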
void ScaleRowDown4Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
src_ptr[stride * 3 + 7] + 8) >>
4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
}
}
void ScaleRowDown34_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
@ -255,8 +279,10 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
@ -269,8 +295,10 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
// Filter rows 0 and 1 together, 3 : 1
void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@ -291,8 +319,10 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* d,
int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
@ -314,8 +344,10 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
// Filter rows 1 and 2 together, 1 : 1
void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@ -336,8 +368,10 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* d,
int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
@ -359,8 +393,11 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[0] = src_ptr[x >> 16];
@ -374,8 +411,11 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleCols_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[0] = src_ptr[x >> 16];
@ -390,8 +430,11 @@ void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
// Scales a single row of pixels up by 2x using point sampling.
void ScaleColsUp2_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[1] = dst_ptr[0] = src_ptr[0];
@ -403,8 +446,11 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleColsUp2_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[1] = dst_ptr[0] = src_ptr[0];
@ -418,16 +464,19 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
// (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__) || defined(__aarch64__)
#define BLENDER(a, b, f) \
(uint8)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
#else
// Intel uses 7 bit math with rounding.
#define BLENDER(a, b, f) \
(uint8)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
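A minimal equivalence check for the two forms (illustrative values, not from the patch): (1-f)*a + f*b == a + f*(b-a), which the ARM BLENDER computes in 16.16 with +0x8000 for round-to-nearest:

int a = 10, b = 250, f = 0x8000;               // f == 0.5 in 16.16
int out = a + ((f * (b - a) + 0x8000) >> 16);  // 10 + 120 == 130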
void ScaleFilterCols_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int xi = x >> 16;
@ -450,8 +499,11 @@ void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleFilterCols64_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x32,
int dx) {
int64 x = (int64)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
@ -477,11 +529,14 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
#undef BLENDER
// Same as 8 bit arm blender but return is cast to uint16
#define BLENDER(a, b, f) \
(uint16)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
void ScaleFilterCols_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int xi = x >> 16;
@ -504,8 +559,11 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
}
void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x32, int dx) {
void ScaleFilterCols64_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x32,
int dx) {
int64 x = (int64)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
@ -530,8 +588,10 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
#undef BLENDER
void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown38_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
int x;
assert(dst_width % 3 == 0);
for (x = 0; x < dst_width; x += 3) {
@ -543,8 +603,10 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
void ScaleRowDown38_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
int x;
assert(dst_width % 3 == 0);
for (x = 0; x < dst_width; x += 3) {
@ -559,25 +621,29 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
// 8x3 -> 3x1
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
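
Background, not part of the patch: the "* (65536 / 9) >> 16" idiom divides a 9-pixel sum by 9 with a fixed-point reciprocal instead of an integer divide. A quick verification that it stays within one of the exact quotient:

#include <stdio.h>

// Sketch only: compare (sum * (65536 / 9)) >> 16 against exact sum / 9.
int main(void) {
  for (int sum = 0; sum <= 9 * 255; ++sum) {
    int approx = (sum * (65536 / 9)) >> 16;
    if (approx != sum / 9 && approx != sum / 9 - 1) {
      printf("mismatch at %d\n", sum);
      return 1;
    }
  }
  printf("always within one of sum / 9\n");
  return 0;
}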
@ -585,66 +651,80 @@ void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) {
uint16* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
}
// 8x2 -> 3x1
void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
(65536 / 6) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
(65536 / 6) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
}
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) {
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
(65536 / 6) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
(65536 / 6) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -680,7 +760,8 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
@ -698,7 +779,8 @@ void ScaleARGBRowDown2_C(const uint8* src_argb,
void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
int x;
for (x = 0; x < dst_width; ++x) {
dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
@ -710,26 +792,34 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
}
}
void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
void ScaleARGBRowDown2Box_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width) {
int x;
for (x = 0; x < dst_width; ++x) {
dst_argb[0] = (src_argb[0] + src_argb[4] +
src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
dst_argb[1] = (src_argb[1] + src_argb[5] +
src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
dst_argb[2] = (src_argb[2] + src_argb[6] +
src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
dst_argb[3] = (src_argb[3] + src_argb[7] +
src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
src_argb[src_stride + 4] + 2) >>
2;
dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
src_argb[src_stride + 5] + 2) >>
2;
dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
src_argb[src_stride + 6] + 2) >>
2;
dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
src_argb[src_stride + 7] + 2) >>
2;
src_argb += 8;
dst_argb += 4;
}
}
void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEven_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
@ -748,25 +838,33 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
int x;
for (x = 0; x < dst_width; ++x) {
dst_argb[0] = (src_argb[0] + src_argb[4] +
src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
dst_argb[1] = (src_argb[1] + src_argb[5] +
src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
dst_argb[2] = (src_argb[2] + src_argb[6] +
src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
dst_argb[3] = (src_argb[3] + src_argb[7] +
src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
src_argb[src_stride + 4] + 2) >>
2;
dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
src_argb[src_stride + 5] + 2) >>
2;
dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
src_argb[src_stride + 6] + 2) >>
2;
dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
src_argb[src_stride + 7] + 2) >>
2;
src_argb += src_stepx * 4;
dst_argb += 4;
}
}
// Scales a single row of pixels using point sampling.
void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBCols_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
@ -782,8 +880,11 @@ void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
}
}
void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x32, int dx) {
void ScaleARGBCols64_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x32,
int dx) {
int64 x = (int64)(x32);
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
@ -801,8 +902,11 @@ void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
}
// Scales a single row of pixels up by 2x using point sampling.
void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBColsUp2_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
@ -818,15 +922,18 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
// Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
#define BLENDERC(a, b, f, s) (uint32)( \
BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
#define BLENDER(a, b, f) \
BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
#define BLENDERC(a, b, f, s) \
(uint32)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
#define BLENDER(a, b, f) \
BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
BLENDERC(a, b, f, 0)
void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBFilterCols_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
@ -854,8 +961,11 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
}
}
void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x32, int dx) {
void ScaleARGBFilterCols64_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x32,
int dx) {
int64 x = (int64)(x32);
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
@ -889,16 +999,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
// Scale plane vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int y, int dy,
int bpp, enum FilterMode filtering) {
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int y,
int dy,
int bpp,
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher bpp.
int dst_width_bytes = dst_width * bpp;
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
int j;
assert(bpp >= 1 && bpp <= 4);
@ -931,9 +1047,9 @@ void ScalePlaneVertical(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
@ -948,23 +1064,29 @@ void ScalePlaneVertical(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * src_stride,
src_stride, dst_width_bytes, yf);
InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
dst_width_bytes, yf);
dst_argb += dst_stride;
y += dy;
}
}
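
Aside, not part of the patch: ScalePlaneVertical shows libyuv's dispatch pattern: resolve the row function once from CPU flags and alignment, keep the C version as the fallback, then call it per output row with yi = y >> 16 and yf = (y >> 8) & 255. A simplified model; the names below are illustrative, not libyuv's:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Sketch only: function-pointer dispatch with a portable C fallback.
typedef void (*RowFn)(uint8_t* dst, const uint8_t* src, ptrdiff_t stride,
                      int width, int frac);

static void InterpolateRowExample(uint8_t* dst, const uint8_t* src,
                                  ptrdiff_t stride, int width, int frac) {
  const uint8_t* src1 = src + stride;
  for (int i = 0; i < width; ++i)
    dst[i] = (uint8_t)((src[i] * (256 - frac) + src1[i] * frac) >> 8);
}

int main(void) {
  RowFn fn = InterpolateRowExample;  // a SIMD variant would be swapped in
                                     // here after CPU/alignment checks.
  uint8_t rows[2][4] = {{0, 0, 0, 0}, {100, 100, 100, 100}};
  uint8_t out[4];
  fn(out, rows[0], 4, 4, 128);  // yf = 128: halfway between the two rows
  printf("%d\n", out[0]);       // 50
  return 0;
}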
void ScalePlaneVertical_16(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_argb, uint16* dst_argb,
int x, int y, int dy,
int wpp, enum FilterMode filtering) {
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16* src_argb,
uint16* dst_argb,
int x,
int y,
int dy,
int wpp,
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_16_C;
void (*InterpolateRow)(uint16 * dst_argb, const uint16* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
int j;
assert(wpp >= 1 && wpp <= 2);
@ -1005,9 +1127,9 @@ void ScalePlaneVertical_16(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_16_DSPR2;
@ -1022,16 +1144,18 @@ void ScalePlaneVertical_16(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * src_stride,
src_stride, dst_width_words, yf);
InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
dst_width_words, yf);
dst_argb += dst_stride;
y += dy;
}
}
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width, int src_height,
int dst_width, int dst_height,
enum FilterMode ScaleFilterReduce(int src_width,
int src_height,
int dst_width,
int dst_height,
enum FilterMode filtering) {
if (src_width < 0) {
src_width = -src_width;
@ -1078,17 +1202,21 @@ int FixedDiv_C(int num, int div) {
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_C(int num, int div) {
return (int)((((int64)(num) << 16) - 0x00010001) /
(div - 1));
return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}
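
For reference, not part of the patch: a worked example of the 16.16 quotient these helpers return:

#include <stdint.h>
#include <stdio.h>

// Sketch only: 3 / 4 as a FixedDiv-style 16.16 fixed-point quotient.
int main(void) {
  int num = 3, div = 4;
  int fixed = (int)(((int64_t)num << 16) / div);
  printf("0x%x = %f\n", fixed, fixed / 65536.0);  // 0xc000 = 0.750000
  return 0;
}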
#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
// Compute slope values for stepping.
void ScaleSlope(int src_width, int src_height,
int dst_width, int dst_height,
void ScaleSlope(int src_width,
int src_height,
int dst_width,
int dst_height,
enum FilterMode filtering,
int* x, int* y, int* dx, int* dy) {
int* x,
int* y,
int* dx,
int* dy) {
assert(x != NULL);
assert(y != NULL);
assert(dx != NULL);
@ -1120,7 +1248,7 @@ void ScaleSlope(int src_width, int src_height,
*x = 0;
}
if (dst_height <= src_height) {
*dy = FixedDiv(src_height, dst_height);
*dy = FixedDiv(src_height, dst_height);
*y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
} else if (dst_height > 1) {
*dy = FixedDiv1(src_height, dst_height);
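
Aside, not part of the patch: a worked CENTERSTART example for a 4 to 2 vertical downscale; dy becomes 2.0 and y starts at dy/2 - 0.5, so each output row samples the center of its source span:

#include <stdio.h>

// Sketch only: CENTERSTART(dy, -32768) for a 4 -> 2 box/bilinear scale.
int main(void) {
  int dy = (4 << 16) / 2;        // FixedDiv(4, 2) = 2.0 in 16.16
  int y = (dy >> 1) + (-32768);  // CENTERSTART(dy, -32768)
  printf("y0=%f y1=%f\n", y / 65536.0, (y + dy) / 65536.0);  // y0=0.5 y1=2.5
  return 0;
}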


@ -21,85 +21,81 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
{ 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
{ 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
{ 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
{ 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
{ 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
{ 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
10, 11, 12, 13, 13, 14, 14, 15};
// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
{ 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
{ 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
{ 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
// Coefficients for source bytes 21 to 31
static vec16 kRound34 =
{ 2, 2, 2, 2, 2, 2, 2, 2 };
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
static uvec8 kShuf38a =
{ 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128};
static uvec8 kShuf38b =
{ 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
6, 8, 11, 14, 128, 128, 128, 128};
// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
{ 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128};
// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
{ 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
6, 7, 12, 13, 128, 128, 128, 128};
// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
{ 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
65536 / 9, 65536 / 6, 0, 0};
// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
{ 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
11, 128, 14, 128, 128, 128, 128, 128};
// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
{ 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
12, 128, 15, 128, 128, 128, 128, 128};
// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
{ 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
13, 128, 128, 128, 128, 128, 128, 128};
// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
65536 / 3, 65536 / 2, 0, 0};
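
Background, not part of the patch: these constants are pshufb shuffle masks; each byte selects a source lane, and a value with the high bit set (128) zeroes that output byte. A scalar model of the instruction, using a copy of kShuf0 as the mask:

#include <stdint.h>
#include <stdio.h>

// Sketch only: scalar model of SSSE3 pshufb with a 16-byte mask.
static void Pshufb(uint8_t* dst, const uint8_t* src, const uint8_t* mask) {
  for (int i = 0; i < 16; ++i)
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
}

int main(void) {
  uint8_t src[16], dst[16];
  const uint8_t kMask[16] = {0, 1, 3, 4, 5, 7, 8, 9,
                             128, 128, 128, 128, 128, 128, 128, 128};
  for (int i = 0; i < 16; ++i) src[i] = (uint8_t)(i * 10);
  Pshufb(dst, src, kMask);
  printf("%d %d %d %d\n", dst[0], dst[2], dst[7], dst[8]);  // 0 30 90 0
  return 0;
}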
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@ -120,8 +116,10 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
@ -149,8 +147,10 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
@ -189,8 +189,10 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@ -213,8 +215,10 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
@ -244,8 +248,10 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
@ -286,8 +292,10 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
#endif // HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
@ -314,8 +322,10 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
@ -368,10 +378,11 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
@ -400,8 +411,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
@ -455,17 +468,19 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
#endif // HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
void ScaleRowDown34_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
asm volatile (
LABELALIGN
"1: \n"
@ -492,25 +507,26 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile (
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
uint8* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile (
LABELALIGN
"1: \n"
@ -557,25 +573,26 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile (
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
uint8* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile (
LABELALIGN
@ -624,8 +641,10 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
);
}
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
@ -655,18 +674,19 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"movdqa %3,%%xmm5 \n"
:
: "m"(kShufAb0), // %0
"m"(kShufAb1), // %1
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
uint8* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"movdqa %3,%%xmm5 \n"
:
: "m"(kShufAb0), // %0
"m"(kShufAb1), // %1
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
asm volatile (
LABELALIGN
"1: \n"
@ -700,17 +720,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
:
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
uint8* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
:
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
asm volatile (
LABELALIGN
"1: \n"
@ -790,7 +811,6 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
);
}
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
@ -823,17 +843,19 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
0x4040, 0x4040, 0x4040, 0x4040};
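
Aside, not part of the patch, and assuming the two 7-bit column fractions sum to 128: subtracting 0x80 biases pixels into signed range so the pmaddubsw products cannot saturate, and adding 0x4040 (128*128 + 64) restores the bias and rounds before the final shift by 7. The scalar arithmetic:

#include <stdio.h>

// Sketch only: the bias trick behind kFsub80 / kFadd40.
int main(void) {
  int a = 10, b = 250, f = 64;        // blend halfway between a and b
  int sa = a - 128, sb = b - 128;     // kFsub80: bias into signed range
  int sum = (128 - f) * sa + f * sb;  // what pmaddubsw accumulates
  int pixel = (sum + 0x4040) >> 7;    // kFadd40: unbias + round, then /128
  printf("%d\n", pixel);              // 130
  return 0;
}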
// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
void ScaleFilterCols_SSSE3(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile (
"movd %6,%%xmm2 \n"
@ -925,8 +947,11 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
void ScaleColsUp2_SSE2(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
asm volatile (
LABELALIGN
"1: \n"
@ -950,7 +975,8 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@ -971,7 +997,8 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@ -995,7 +1022,8 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@ -1025,8 +1053,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb,
int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
asm volatile (
@ -1059,8 +1090,10 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, int src_stepx,
uint8* dst_argb, int dst_width) {
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb,
int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
@ -1102,8 +1135,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
);
}
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBCols_SSE2(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
intptr_t x0, x1;
asm volatile (
"movd %5,%%xmm2 \n"
@ -1171,8 +1207,11 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
asm volatile (
LABELALIGN
"1: \n"
@ -1197,26 +1236,29 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};
// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
intptr_t x0, x1;
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile(
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile (
"movd %5,%%xmm2 \n"
@ -1283,34 +1325,32 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
asm volatile (
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx"
);
asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
asm volatile (
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"sub $0x10001,%%eax \n"
"sbb $0x0,%%edx \n"
"sub $0x1,%1 \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx"
);
asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"sub $0x10001,%%eax \n"
"sbb $0x0,%%edx \n"
"sub $0x1,%1 \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}


@ -17,168 +17,167 @@ extern "C" {
#endif
// This module is for GCC MIPS DSPR2
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 4 \n" // iterations -> by 16
"beqz $t9, 2f \n"
" nop \n"
"srl $t9, %[dst_width], 4 \n" // iterations -> by 16
"beqz $t9, 2f \n"
" nop \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
// TODO(fbarchard): Use odd pixels instead of even.
"precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
"precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
"precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
"precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t8, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $t1, 8(%[dst]) \n"
"sw $t2, 12(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
// TODO(fbarchard): Use odd pixels instead of even.
"precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
"precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
"precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
"precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t8, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $t1, 8(%[dst]) \n"
"sw $t2, 12(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"2: \n"
"andi $t9, %[dst_width], 0xf \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"2: \n"
"andi $t9, %[dst_width], 0xf \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lbu $t0, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 2 \n"
"addiu $t9, $t9, -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"21: \n"
"lbu $t0, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 2 \n"
"addiu $t9, $t9, -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
"3: \n"
".set pop \n"
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
: [dst_width] "r"(dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
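
Background, not part of the patch: precr.qb.ph packs the even bytes of its two word operands, so the loop above keeps every second source pixel. The scalar equivalent:

#include <stdint.h>
#include <stdio.h>

// Sketch only: keep even-indexed pixels, as the precr.qb.ph packing does.
int main(void) {
  const uint8_t src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  uint8_t dst[4];
  for (int i = 0; i < 4; ++i) dst[i] = src[2 * i];
  printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  // 0 2 4 6
  return 0;
}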
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
const uint8* t = src_ptr + src_stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 3 \n" // iterations -> step 8
"bltz $t9, 2f \n"
" nop \n"
"srl $t9, %[dst_width], 3 \n" // iterations -> step 8
"bltz $t9, 2f \n"
" nop \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 0(%[t]) \n" // |19|18|17|16|
"lw $t5, 4(%[t]) \n" // |23|22|21|20|
"lw $t6, 8(%[t]) \n" // |27|26|25|24|
"lw $t7, 12(%[t]) \n" // |31|30|29|28|
"addiu $t9, $t9, -1 \n"
"srl $t8, $t0, 16 \n" // |X|X|3|2|
"ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
"ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
"raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
"raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
"shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
"shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
"srl $t8, $t1, 16 \n" // |X|X|7|6|
"ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
"ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
"raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
"raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
"shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
"shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
"srl $t8, $t2, 16 \n" // |X|X|11|10|
"ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
"ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
"raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
"raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
"shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
"shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
"srl $t8, $t3, 16 \n" // |X|X|15|14|
"ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
"ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
"raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
"raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
"shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
"shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
"addiu %[src_ptr], %[src_ptr], 16 \n"
"addiu %[t], %[t], 16 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"sb $t1, 2(%[dst]) \n"
"sb $t5, 3(%[dst]) \n"
"sb $t2, 4(%[dst]) \n"
"sb $t6, 5(%[dst]) \n"
"sb $t3, 6(%[dst]) \n"
"sb $t7, 7(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 0(%[t]) \n" // |19|18|17|16|
"lw $t5, 4(%[t]) \n" // |23|22|21|20|
"lw $t6, 8(%[t]) \n" // |27|26|25|24|
"lw $t7, 12(%[t]) \n" // |31|30|29|28|
"addiu $t9, $t9, -1 \n"
"srl $t8, $t0, 16 \n" // |X|X|3|2|
"ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
"ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
"raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
"raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
"shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
"shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
"srl $t8, $t1, 16 \n" // |X|X|7|6|
"ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
"ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
"raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
"raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
"shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
"shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
"srl $t8, $t2, 16 \n" // |X|X|11|10|
"ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
"ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
"raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
"raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
"shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
"shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
"srl $t8, $t3, 16 \n" // |X|X|15|14|
"ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
"ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
"raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
"raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
"shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
"shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
"addiu %[src_ptr], %[src_ptr], 16 \n"
"addiu %[t], %[t], 16 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"sb $t1, 2(%[dst]) \n"
"sb $t5, 3(%[dst]) \n"
"sb $t2, 4(%[dst]) \n"
"sb $t6, 5(%[dst]) \n"
"sb $t3, 6(%[dst]) \n"
"sb $t7, 7(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"2: \n"
"andi $t9, %[dst_width], 0x7 \n" // x = residue
"beqz $t9, 3f \n"
" nop \n"
"2: \n"
"andi $t9, %[dst_width], 0x7 \n" // x = residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lwr $t1, 0(%[src_ptr]) \n"
"lwl $t1, 3(%[src_ptr]) \n"
"lwr $t2, 0(%[t]) \n"
"lwl $t2, 3(%[t]) \n"
"srl $t8, $t1, 16 \n"
"ins $t1, $t2, 16, 16 \n"
"ins $t2, $t8, 0, 16 \n"
"raddu.w.qb $t1, $t1 \n"
"raddu.w.qb $t2, $t2 \n"
"shra_r.w $t1, $t1, 2 \n"
"shra_r.w $t2, $t2, 2 \n"
"sb $t1, 0(%[dst]) \n"
"sb $t2, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -2 \n"
"addiu %[t], %[t], 4 \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 2 \n"
"21: \n"
"lwr $t1, 0(%[src_ptr]) \n"
"lwl $t1, 3(%[src_ptr]) \n"
"lwr $t2, 0(%[t]) \n"
"lwl $t2, 3(%[t]) \n"
"srl $t8, $t1, 16 \n"
"ins $t1, $t2, 16, 16 \n"
"ins $t2, $t8, 0, 16 \n"
"raddu.w.qb $t1, $t1 \n"
"raddu.w.qb $t2, $t2 \n"
"shra_r.w $t1, $t1, 2 \n"
"shra_r.w $t2, $t2, 2 \n"
"sb $t1, 0(%[dst]) \n"
"sb $t2, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -2 \n"
"addiu %[t], %[t], 4 \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 2 \n"
"3: \n"
".set pop \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst), [t] "+r" (t)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
: [dst_width] "r"(dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
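
Aside, not part of the patch: the ins/raddu.w.qb/shra_r.w sequence regroups two pixels from each row into one word, sums its four bytes, and applies a rounded shift, which is exactly a 2x2 box average. In scalar form:

#include <stdint.h>
#include <stdio.h>

// Sketch only: rounded 2x2 box average, the scalar form of
// raddu.w.qb (sum four bytes) plus shra_r.w (rounded shift right).
int main(void) {
  const uint8_t row0[2] = {10, 20};
  const uint8_t row1[2] = {30, 41};
  int sum = row0[0] + row0[1] + row1[0] + row1[1];
  int avg = (sum + 2) >> 2;
  printf("%d\n", avg);  // (101 + 2) >> 2 = 25
  return 0;
}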
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown4_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
@ -186,7 +185,7 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n"
" nop \n"
"1: \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
@ -208,12 +207,12 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"2: \n"
"2: \n"
"andi $t9, %[dst_width], 7 \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"21: \n"
"lbu $t1, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -1 \n"
@ -221,31 +220,30 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
: [dst_width] "r"(dst_width)
: "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
const uint8* s3 = s2 + stride;
__asm__ __volatile__ (
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
@ -299,23 +297,20 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"2: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[s1] "+r" (s1),
[s2] "+r" (s2),
[s3] "+r" (s3)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
[s3] "+r"(s3)
: [dst_width] "r"(dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown34_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"1: \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
@ -347,23 +342,21 @@ void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bnez %[dst_width], 1b \n"
" addiu %[dst], %[dst], 24 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
@ -400,26 +393,24 @@ void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
: [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
[dst_width] "+r"(dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
: "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
@ -452,25 +443,23 @@ void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
: [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
[dst_width] "+r"(dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
: "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown38_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
@ -501,26 +490,24 @@ void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgez $t8, 1b \n"
" addiu %[dst], %[dst], 12 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
:
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
const int c = 0x2AAA;
__asm__ __volatile__ (
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
@ -554,18 +541,16 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[t] "+r" (t),
[dst_width] "+r" (dst_width)
: [c] "r" (c)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6"
);
: [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
[dst_width] "+r"(dst_width)
: [c] "r"(c)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
stride += stride;
@ -573,11 +558,11 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
const int c1 = 0x1C71;
const int c2 = 0x2AAA;
__asm__ __volatile__ (
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
@ -624,15 +609,10 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[s1] "+r" (s1),
[s2] "+r" (s2),
[dst_width] "+r" (dst_width)
: [c1] "r" (c1), [c2] "r" (c2)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
: [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
[s2] "+r"(s2), [dst_width] "+r"(dst_width)
: [c1] "r"(c1), [c2] "r"(c2)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@ -641,4 +621,3 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
} // extern "C"
} // namespace libyuv
#endif


@ -23,8 +23,10 @@ extern "C" {
// Provided by Fritz Koenig
// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
@ -43,8 +45,10 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x1, average down, and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -66,8 +70,10 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x2, average down, and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
@ -95,8 +101,10 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -113,12 +121,14 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
asm volatile (
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4
@ -155,7 +165,8 @@ asm volatile (
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -175,7 +186,8 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
@ -234,7 +246,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
@ -274,21 +287,19 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
}
#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
static uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
18, 6, 14, 19, 0, 0, 0, 0};
static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
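These constants are 16.16 fixed-point reciprocals. They pair with vqrdmulh.s16, which computes roughly (a * b * 2) >> 16 with rounding, so multiplying a box sum by 65536/18 divides it by 9, and by 65536/12 divides it by 6, with no integer divide. A scalar illustration with made-up values (plain shifts truncate where vqrdmulh rounds):

int sum9 = 9 * 200;                          // a 3x3 box sum
int avg9 = (sum9 * (65536 / 18) * 2) >> 16;  // ~200, i.e. sum9 / 9
int sum6 = 6 * 200;                          // a 3x2 box sum
int avg6 = (sum6 * (65536 / 12) * 2) >> 16;  // ~200, i.e. sum6 / 6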
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {q3}, [%3] \n"
@ -314,7 +325,8 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
asm volatile (
@ -433,7 +445,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
MEMACCESS(4)
"vld1.16 {q13}, [%4] \n"
@ -530,8 +543,11 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
);
}
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
void ScaleAddRows_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr,
int src_width,
int src_height) {
const uint8* src_tmp;
asm volatile (
"1: \n"
@ -563,6 +579,7 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
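In scalar terms, this routine produces 16-bit column sums over src_height rows, which the caller later divides by the box height. A sketch of that contract (hypothetical name, not the shipped C function):

static void ScaleAddRows_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint16* dst_ptr, int src_width,
                                int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    const uint8* s = src_ptr + x;
    for (int y = 0; y < src_height; ++y) {
      sum += *s;
      s += src_stride;
    }
    dst_ptr[x] = sum;  // 16-bit column sum; divided by box height later
  }
}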
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
@ -571,13 +588,17 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// clang-format on
// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
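Written out as a plain function, that formula blends source pixels a and b by the 16.16 fraction f with rounding (illustrative helper, not part of the library):

static inline uint8 BlendPixel(uint8 a, uint8 b, int f) {
  return (uint8)((int)a + ((f * ((int)b - (int)a) + 0x8000) >> 16));
}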
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
void ScaleFilterCols_NEON(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
@ -640,8 +661,10 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
const uint8* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 100f \n"
@ -737,8 +760,10 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
);
}
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
@ -760,8 +785,10 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -788,8 +815,10 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
@ -829,8 +858,11 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb,
int dst_width) {
asm volatile (
"mov r12, %3, lsl #2 \n"
"1: \n"
@ -856,9 +888,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
asm volatile (
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
@ -902,17 +936,22 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
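Both even samplers step through the source by src_stepx ARGB pixels, i.e. src_stepx * 4 bytes (the "lsl #2" above); the Box variant additionally averages a 2x2 block at each position. A scalar sketch of the plain variant (casts and loop for illustration only):

const uint32* src = (const uint32*)src_argb;  // 4 bytes per ARGB pixel
uint32* dst = (uint32*)dst_argb;
for (int j = 0; j < dst_width; ++j) {
  dst[j] = src[j * src_stepx];  // take every src_stepx-th pixel
}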
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld1.32 {"#dn"["#n"]}, [%6] \n"
#define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld1.32 {" #dn "[" #n "]}, [%6] \n"
// clang-format on
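The stepping here is 16.16 fixed point: the top 16 bits of x select the source pixel (the "lsr ... #16" above, then shifted left by 2 to index 4-byte ARGB pixels), and x advances by dx per output pixel. A scalar equivalent, with src and dst viewed as uint32 ARGB pixels (names illustrative):

for (int j = 0; j < dst_width; ++j) {
  dst[j] = src[x >> 16];  // integer part of x picks the source pixel
  x += dx;                // fractional step; dx == 65536 means 1:1 scale
}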
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
int tmp;
const uint8* src_tmp = src_argb;
asm volatile (
@ -944,17 +983,22 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
#undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
// clang-format on
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;


@ -21,8 +21,10 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 32x1 throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
"1: \n"
// load even pixels into v0, odd into v1
@ -41,8 +43,10 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -64,8 +68,10 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
@ -93,8 +99,10 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -111,12 +119,14 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
@ -152,7 +162,8 @@ asm volatile (
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -172,7 +183,8 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
"movi v20.8b, #3 \n"
"add %3, %3, %0 \n"
@ -232,7 +244,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
"movi v20.8b, #3 \n"
"add %3, %3, %0 \n"
@ -273,21 +286,19 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
);
}
static uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
{ 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
static uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
34, 6, 22, 35, 0, 0, 0, 0};
static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
MEMACCESS(3)
"ld1 {v3.16b}, [%3] \n"
@ -312,7 +323,8 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
ptrdiff_t tmp_src_stride = src_stride;
@ -441,7 +453,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
@ -545,8 +558,11 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
);
}
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
void ScaleAddRows_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr,
int src_width,
int src_height) {
const uint8* src_tmp;
asm volatile (
"1: \n"
@ -578,27 +594,32 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {v4.b, v5.b}["#n"], [%6] \n"
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {v4.b, v5.b}[" #n "], [%6] \n"
// clang-format on
// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
void ScaleFilterCols_NEON(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64)x;
int64 dx64 = (int64)dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
@ -658,9 +679,11 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
const uint8* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
asm volatile (
"cmp %w4, #0 \n"
"b.eq 100f \n"
@ -756,8 +779,10 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
);
}
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
@ -779,8 +804,10 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width) {
asm volatile (
"1: \n"
MEMACCESS (0)
@ -806,8 +833,10 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
@ -843,8 +872,11 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb,
int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@ -871,9 +903,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
asm volatile (
"add %1, %1, %0 \n"
"1: \n"
@ -920,21 +954,26 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld1 {"#vn".s}["#n"], [%6] \n"
#define LOAD1_DATA32_LANE(vn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld1 {" #vn ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64)x;
int64 dx64 = (int64)dx;
int64 tmp64;
asm volatile (
"1: \n"
@ -965,23 +1004,28 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
#undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64)x;
int64 dx64 = (int64)dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx

File diff suppressed because it is too large.

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/video_common.h"
#ifdef __cplusplus
@ -24,24 +23,24 @@ struct FourCCAliasEntry {
};
static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
{FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
{FOURCC_HDYC, FOURCC_UYVY},
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
{FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_RGB3, FOURCC_RAW },
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
{FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB
{FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
{FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
{FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
{FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
{FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
{FOURCC_HDYC, FOURCC_UYVY},
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
{FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_RGB3, FOURCC_RAW},
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
{FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB
{FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
{FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
{FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
};
// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
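The lookup over this table is a linear scan that falls through to the input when no alias matches; a sketch consistent with the function below, assuming the two fields of FourCCAliasEntry are named alias and canonical:

uint32 CanonicalFourCC_Sketch(uint32 fourcc) {
  size_t i;
  for (i = 0; i < sizeof(kFourCCAliases) / sizeof(kFourCCAliases[0]); ++i) {
    if (kFourCCAliases[i].alias == fourcc) {
      return kFourCCAliases[i].canonical;
    }
  }
  return fourcc;  // not an alias: already canonical
}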
@ -62,4 +61,3 @@ uint32 CanonicalFourCC(uint32 fourcc) {
} // extern "C"
} // namespace libyuv
#endif


@ -10,13 +10,13 @@
#include <stdlib.h>
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/convert.h"
#include "libyuv/convert_argb.h"
#include "libyuv/convert_from.h"
#include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
@ -38,110 +38,103 @@ namespace libyuv {
#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \
TEST_F(LibYUVColorTest, TESTNAME) { \
const int kPixels = benchmark_width_ * benchmark_height_; \
const int kHalfPixels = ((benchmark_width_ + 1) / 2) * \
((benchmark_height_ + HS1) / HS); \
align_buffer_page_end(orig_y, kPixels); \
align_buffer_page_end(orig_u, kHalfPixels); \
align_buffer_page_end(orig_v, kHalfPixels); \
align_buffer_page_end(orig_pixels, kPixels * 4); \
align_buffer_page_end(temp_y, kPixels); \
align_buffer_page_end(temp_u, kHalfPixels); \
align_buffer_page_end(temp_v, kHalfPixels); \
align_buffer_page_end(dst_pixels_opt, kPixels * 4); \
align_buffer_page_end(dst_pixels_c, kPixels * 4); \
const int kPixels = benchmark_width_ * benchmark_height_; \
const int kHalfPixels = \
((benchmark_width_ + 1) / 2) * ((benchmark_height_ + HS1) / HS); \
align_buffer_page_end(orig_y, kPixels); \
align_buffer_page_end(orig_u, kHalfPixels); \
align_buffer_page_end(orig_v, kHalfPixels); \
align_buffer_page_end(orig_pixels, kPixels * 4); \
align_buffer_page_end(temp_y, kPixels); \
align_buffer_page_end(temp_u, kHalfPixels); \
align_buffer_page_end(temp_v, kHalfPixels); \
align_buffer_page_end(dst_pixels_opt, kPixels * 4); \
align_buffer_page_end(dst_pixels_c, kPixels * 4); \
\
MemRandomize(orig_pixels, kPixels * 4); \
MemRandomize(orig_y, kPixels); \
MemRandomize(orig_u, kHalfPixels); \
MemRandomize(orig_v, kHalfPixels); \
MemRandomize(temp_y, kPixels); \
MemRandomize(temp_u, kHalfPixels); \
MemRandomize(temp_v, kHalfPixels); \
MemRandomize(dst_pixels_opt, kPixels * 4); \
MemRandomize(dst_pixels_c, kPixels * 4); \
MemRandomize(orig_pixels, kPixels * 4); \
MemRandomize(orig_y, kPixels); \
MemRandomize(orig_u, kHalfPixels); \
MemRandomize(orig_v, kHalfPixels); \
MemRandomize(temp_y, kPixels); \
MemRandomize(temp_u, kHalfPixels); \
MemRandomize(temp_v, kHalfPixels); \
MemRandomize(dst_pixels_opt, kPixels * 4); \
MemRandomize(dst_pixels_c, kPixels * 4); \
\
/* The test is overall for color conversion matrix being reversible, so */ \
/* this initializes the pixel with 2x2 blocks to eliminate subsampling. */ \
uint8* p = orig_y; \
for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \
for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
uint8 r = static_cast<uint8>(fastrand()); \
p[0] = r; \
p[1] = r; \
p[HN] = r; \
p[HN + 1] = r; \
p += 2; \
/* The test is overall for color conversion matrix being reversible, so */ \
/* this initializes the pixel with 2x2 blocks to eliminate subsampling. */ \
uint8* p = orig_y; \
for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \
for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
uint8 r = static_cast<uint8>(fastrand()); \
p[0] = r; \
p[1] = r; \
p[HN] = r; \
p[HN + 1] = r; \
p += 2; \
} \
if (benchmark_width_ & 1) { \
uint8 r = static_cast<uint8>(fastrand()); \
p[0] = r; \
p[HN] = r; \
p += 1; \
} \
p += HN; \
} \
if (benchmark_width_ & 1) { \
uint8 r = static_cast<uint8>(fastrand()); \
p[0] = r; \
p[HN] = r; \
p += 1; \
if ((benchmark_height_ & 1) && HS == 2) { \
for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
uint8 r = static_cast<uint8>(fastrand()); \
p[0] = r; \
p[1] = r; \
p += 2; \
} \
if (benchmark_width_ & 1) { \
uint8 r = static_cast<uint8>(fastrand()); \
p[0] = r; \
p += 1; \
} \
} \
p += HN; \
} \
if ((benchmark_height_ & 1) && HS == 2) { \
for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
uint8 r = static_cast<uint8>(fastrand()); \
p[0] = r; \
p[1] = r; \
p += 2; \
/* Start with YUV converted to ARGB. */ \
YUVTOARGB(orig_y, benchmark_width_, orig_u, (benchmark_width_ + 1) / 2, \
orig_v, (benchmark_width_ + 1) / 2, orig_pixels, \
benchmark_width_ * 4, benchmark_width_, benchmark_height_); \
\
ARGBTOYUV(orig_pixels, benchmark_width_ * 4, temp_y, benchmark_width_, \
temp_u, (benchmark_width_ + 1) / 2, temp_v, \
(benchmark_width_ + 1) / 2, benchmark_width_, \
benchmark_height_); \
\
MaskCpuFlags(disable_cpu_flags_); \
YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \
temp_v, (benchmark_width_ + 1) / 2, dst_pixels_c, \
benchmark_width_ * 4, benchmark_width_, benchmark_height_); \
MaskCpuFlags(benchmark_cpu_info_); \
\
for (int i = 0; i < benchmark_iterations_; ++i) { \
YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \
temp_v, (benchmark_width_ + 1) / 2, dst_pixels_opt, \
benchmark_width_ * 4, benchmark_width_, benchmark_height_); \
} \
if (benchmark_width_ & 1) { \
uint8 r = static_cast<uint8>(fastrand()); \
p[0] = r; \
p += 1; \
/* Test C and SIMD match. */ \
for (int i = 0; i < kPixels * 4; ++i) { \
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
} \
/* Test SIMD is close to original. */ \
for (int i = 0; i < kPixels * 4; ++i) { \
EXPECT_NEAR(static_cast<int>(orig_pixels[i]), \
static_cast<int>(dst_pixels_opt[i]), DIFF); \
} \
} \
/* Start with YUV converted to ARGB. */ \
YUVTOARGB(orig_y, benchmark_width_, \
orig_u, (benchmark_width_ + 1) / 2, \
orig_v, (benchmark_width_ + 1) / 2, \
orig_pixels, benchmark_width_ * 4, \
benchmark_width_, benchmark_height_); \
\
ARGBTOYUV(orig_pixels, benchmark_width_ * 4, \
temp_y, benchmark_width_, \
temp_u, (benchmark_width_ + 1) / 2, \
temp_v, (benchmark_width_ + 1) / 2, \
benchmark_width_, benchmark_height_); \
\
MaskCpuFlags(disable_cpu_flags_); \
YUVTOARGB(temp_y, benchmark_width_, \
temp_u, (benchmark_width_ + 1) / 2, \
temp_v, (benchmark_width_ + 1) / 2, \
dst_pixels_c, benchmark_width_ * 4, \
benchmark_width_, benchmark_height_); \
MaskCpuFlags(benchmark_cpu_info_); \
\
for (int i = 0; i < benchmark_iterations_; ++i) { \
YUVTOARGB(temp_y, benchmark_width_, \
temp_u, (benchmark_width_ + 1) / 2, \
temp_v, (benchmark_width_ + 1) / 2, \
dst_pixels_opt, benchmark_width_ * 4, \
benchmark_width_, benchmark_height_); \
} \
/* Test C and SIMD match. */ \
for (int i = 0; i < kPixels * 4; ++i) { \
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
} \
/* Test SIMD is close to original. */ \
for (int i = 0; i < kPixels * 4; ++i) { \
EXPECT_NEAR(static_cast<int>(orig_pixels[i]), \
static_cast<int>(dst_pixels_opt[i]), DIFF); \
} \
\
free_aligned_buffer_page_end(orig_pixels); \
free_aligned_buffer_page_end(orig_y); \
free_aligned_buffer_page_end(orig_u); \
free_aligned_buffer_page_end(orig_v); \
free_aligned_buffer_page_end(temp_y); \
free_aligned_buffer_page_end(temp_u); \
free_aligned_buffer_page_end(temp_v); \
free_aligned_buffer_page_end(dst_pixels_opt); \
free_aligned_buffer_page_end(dst_pixels_c); \
} \
free_aligned_buffer_page_end(orig_pixels); \
free_aligned_buffer_page_end(orig_y); \
free_aligned_buffer_page_end(orig_u); \
free_aligned_buffer_page_end(orig_v); \
free_aligned_buffer_page_end(temp_y); \
free_aligned_buffer_page_end(temp_u); \
free_aligned_buffer_page_end(temp_v); \
free_aligned_buffer_page_end(dst_pixels_opt); \
free_aligned_buffer_page_end(dst_pixels_c); \
}
TESTCS(TestI420, I420ToARGB, ARGBToI420, 1, 2, benchmark_width_, ERROR_FULL)
TESTCS(TestI422, I422ToARGB, ARGBToI422, 0, 1, 0, ERROR_FULL)
@ -163,11 +156,8 @@ static void YUVToRGB(int y, int u, int v, int* r, int* g, int* b) {
memset(orig_v, v, kHalfPixels);
/* YUV converted to ARGB. */
I422ToARGB(orig_y, kWidth,
orig_u, (kWidth + 1) / 2,
orig_v, (kWidth + 1) / 2,
orig_pixels, kWidth * 4,
kWidth, kHeight);
I422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
orig_pixels, kWidth * 4, kWidth, kHeight);
*b = orig_pixels[0];
*g = orig_pixels[1];
@ -189,11 +179,8 @@ static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) {
memset(orig_v, v, kHalfPixels);
/* YUV converted to ARGB. */
J422ToARGB(orig_y, kWidth,
orig_u, (kWidth + 1) / 2,
orig_v, (kWidth + 1) / 2,
orig_pixels, kWidth * 4,
kWidth, kHeight);
J422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
orig_pixels, kWidth * 4, kWidth, kHeight);
*b = orig_pixels[0];
*g = orig_pixels[1];
@ -248,7 +235,7 @@ static void YJToRGB(int y, int* r, int* g, int* b) {
#if defined(CLAMPMETHOD_IF)
static int RoundToByte(float f) {
int i = ROUND(f);
int i = ROUND(f);
if (i < 0) {
i = 0;
}
@ -259,52 +246,61 @@ static int RoundToByte(float f) {
}
#elif defined(CLAMPMETHOD_TABLE)
static const unsigned char clamptable[811] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163,
164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193,
194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188,
189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,
219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248,
249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255};
static int RoundToByte(float f) {
return clamptable[ROUND(f) + 276];
@ -317,7 +313,7 @@ static int RoundToByte(float f) {
#elif defined(CLAMPMETHOD_MASK)
static int RoundToByte(float f) {
int i = ROUND(f);
i = ((-(i) >> 31) & (i)); // clamp to 0.
i = ((-(i) >> 31) & (i)); // clamp to 0.
return (((255 - (i)) >> 31) | (i)) & 255; // clamp to 255.
}
#endif
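The mask variant is worth a gloss: it relies on arithmetic right shift of a negative int filling with one bits (implementation-defined in standard C, but true on the compilers this code targets). Step by step, as a standalone sketch:

static int ClampToByte_Sketch(int i) {
  // i < 0:  -(i) > 0, so -(i) >> 31 == 0 and the AND clears i to 0.
  // i >= 0: -(i) <= 0, so the shift yields all ones (or i is already 0).
  i = (-(i) >> 31) & i;
  // i > 255: 255 - i < 0, the shift yields all ones, the OR saturates to -1,
  // and the final AND leaves 255. Otherwise i passes through unchanged.
  return (((255 - i) >> 31) | i) & 255;
}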
@ -433,7 +429,6 @@ TEST_F(LibYUVColorTest, TestGreyYUV) {
EXPECT_EQ(130, g1);
EXPECT_EQ(130, b1);
for (int y = 0; y < 256; ++y) {
YUVToRGBReference(y, 128, 128, &r0, &g0, &b0);
YUVToRGB(y, 128, 128, &r1, &g1, &b1);
@ -477,7 +472,17 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
}
TEST_F(LibYUVColorTest, TestFullYUV) {
int rh[256] = { 0, }, gh[256] = { 0, }, bh[256] = { 0, };
int rh[256] =
{
0,
},
gh[256] =
{
0,
},
bh[256] = {
0,
};
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; ++y2) {
@ -498,7 +503,17 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
}
TEST_F(LibYUVColorTest, TestFullYUVJ) {
int rh[256] = { 0, }, gh[256] = { 0, }, bh[256] = { 0, };
int rh[256] =
{
0,
},
gh[256] =
{
0,
},
bh[256] = {
0,
};
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; ++y2) {


@ -36,7 +36,8 @@ TEST_F(LibYUVBaseTest, Djb2_Test) {
align_buffer_page_end(src_a, kMaxTest);
align_buffer_page_end(src_b, kMaxTest);
const char* fox = "The quick brown fox jumps over the lazy dog"
const char* fox =
"The quick brown fox jumps over the lazy dog"
" and feels as if he were in the seventh heaven of typography"
" together with Hermann Zapf";
uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381);
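HashDjb2 is the classic djb2 recurrence, hash = hash * 33 + byte, seeded here with 5381 over the first 131 bytes. A scalar sketch of that recurrence (illustrative name and signature; the shipped function is SIMD-accelerated):

uint32 HashDjb2_Sketch(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];
  }
  return hash;
}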
@ -155,19 +156,19 @@ TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Opt) {
}
src_a[0] = 0;
fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_BGRA), fourcc);
src_a[0] = 255;
src_a[3] = 0;
fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_ARGB), fourcc);
src_a[3] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
}
EXPECT_EQ(0u, fourcc);
@ -183,19 +184,19 @@ TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) {
}
src_a[0 + 1] = 0;
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_BGRA), fourcc);
src_a[0 + 1] = 255;
src_a[3 + 1] = 0;
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_ARGB), fourcc);
src_a[3 + 1] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
}
EXPECT_EQ(0u, fourcc);
@ -220,8 +221,9 @@ TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
memset(src_a, 0, kMaxWidth);
memset(src_b, 0, kMaxWidth);
int count = benchmark_iterations_ *
((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
int count =
benchmark_iterations_ *
((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
for (int i = 0; i < count; ++i) {
h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
}
@ -284,8 +286,7 @@ TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) {
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
CalcFramePsnr(src_a, benchmark_width_,
src_b, benchmark_width_,
CalcFramePsnr(src_a, benchmark_width_, src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / benchmark_iterations_;
@ -309,8 +310,7 @@ TEST_F(LibYUVBaseTest, BenchmarkPsnr_Unaligned) {
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
CalcFramePsnr(src_a + 1, benchmark_width_,
src_b, benchmark_width_,
CalcFramePsnr(src_a + 1, benchmark_width_, src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / benchmark_iterations_;
@ -335,24 +335,24 @@ TEST_F(LibYUVBaseTest, Psnr) {
double err;
err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
EXPECT_EQ(err, kMaxPsnr);
memset(src_a, 255, kSrcPlaneSize);
err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
EXPECT_EQ(err, 0.0);
memset(src_a, 1, kSrcPlaneSize);
err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
EXPECT_GT(err, 48.0);
EXPECT_LT(err, 49.0);
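The 48-49 dB window follows directly from the PSNR definition: with every byte off by exactly 1, the mean squared error is 1, and 10 * log10(255^2 / 1) is about 48.13 dB. As a one-line check (needs <math.h>):

double psnr = 10.0 * log10(255.0 * 255.0 / 1.0);  // ~48.13, inside (48, 49)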
@ -362,8 +362,8 @@ TEST_F(LibYUVBaseTest, Psnr) {
}
err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
EXPECT_GT(err, 2.0);
if (kSrcWidth * kSrcHeight >= 256) {
@ -384,14 +384,14 @@ TEST_F(LibYUVBaseTest, Psnr) {
double c_err, opt_err;
c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
MaskCpuFlags(benchmark_cpu_info_);
opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
EXPECT_EQ(opt_err, c_err);
@ -411,8 +411,7 @@ TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) {
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
CalcFrameSsim(src_a, benchmark_width_,
src_b, benchmark_width_,
CalcFrameSsim(src_a, benchmark_width_, src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / benchmark_iterations_;
@ -435,14 +434,14 @@ TEST_F(LibYUVBaseTest, Ssim) {
memset(src_a, 0, kSrcPlaneSize);
memset(src_b, 0, kSrcPlaneSize);
if (kSrcWidth <=8 || kSrcHeight <= 8) {
if (kSrcWidth <= 8 || kSrcHeight <= 8) {
printf("warning - Ssim size too small. Testing function executes.\n");
}
double err;
err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_EQ(err, 1.0);
@ -451,8 +450,8 @@ TEST_F(LibYUVBaseTest, Ssim) {
memset(src_a, 255, kSrcPlaneSize);
err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_LT(err, 0.0001);
@ -461,8 +460,8 @@ TEST_F(LibYUVBaseTest, Ssim) {
memset(src_a, 1, kSrcPlaneSize);
err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_GT(err, 0.0001);
@ -474,8 +473,8 @@ TEST_F(LibYUVBaseTest, Ssim) {
}
err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_GT(err, 0.0);
@ -493,14 +492,14 @@ TEST_F(LibYUVBaseTest, Ssim) {
double c_err, opt_err;
c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
MaskCpuFlags(benchmark_cpu_info_);
opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_EQ(opt_err, c_err);

File diff suppressed because it is too large.

@ -11,10 +11,10 @@
#include <stdlib.h>
#include <string.h>
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/version.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
@ -64,19 +64,20 @@ TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
printf("x64 build\n");
#endif
#ifdef _MSC_VER
printf("_MSC_VER %d\n", _MSC_VER);
printf("_MSC_VER %d\n", _MSC_VER);
#endif
#if !defined(LIBYUV_DISABLE_X86) && (defined(GCC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(VISUALC_HAS_AVX2))
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(GCC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
defined(VISUALC_HAS_AVX2))
printf("Has AVX2 1\n");
#else
printf("Has AVX2 0\n");
// If compiler does not support AVX2, the following function is not expected:
// If compiler does not support AVX2, the following function is not expected:
#endif
}
#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
defined(_M_X64)
TEST_F(LibYUVBaseTest, TestCpuId) {
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
@ -110,8 +111,8 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
CpuId(1, 0, cpu_info);
int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
model, model);
printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
model);
}
}
#endif


@ -12,11 +12,11 @@
#include <string.h>
#include <time.h>
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
#include "../unit_test/unit_test.h"
namespace libyuv {

File diff suppressed because it is too large.

@ -10,14 +10,16 @@
#include <stdlib.h>
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/rotate_argb.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
void TestRotateBpp(int src_width, int src_height,
int dst_width, int dst_height,
void TestRotateBpp(int src_width,
int src_height,
int dst_width,
int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
@ -51,26 +53,22 @@ void TestRotateBpp(int src_width, int src_height,
if (kBpp == 1) {
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
RotatePlane(src_argb, src_stride_argb,
dst_argb_c, dst_stride_argb,
RotatePlane(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
RotatePlane(src_argb, src_stride_argb,
dst_argb_opt, dst_stride_argb,
RotatePlane(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb,
src_width, src_height, mode);
}
} else if (kBpp == 4) {
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
ARGBRotate(src_argb, src_stride_argb,
dst_argb_c, dst_stride_argb,
ARGBRotate(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
ARGBRotate(src_argb, src_stride_argb,
dst_argb_opt, dst_stride_argb,
ARGBRotate(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb,
src_width, src_height, mode);
}
}
@ -85,112 +83,104 @@ void TestRotateBpp(int src_width, int src_height,
free_aligned_buffer_page_end(src_argb);
}
static void ARGBTestRotate(int src_width, int src_height,
int dst_width, int dst_height,
static void ARGBTestRotate(int src_width,
int src_height,
int dst_width,
int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
TestRotateBpp(src_width, src_height,
dst_width, dst_height,
mode, benchmark_iterations,
disable_cpu_flags, benchmark_cpu_info, 4);
TestRotateBpp(src_width, src_height, dst_width, dst_height, mode,
benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 4);
}
TEST_F(LibYUVRotateTest, ARGBRotate0_Opt) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, ARGBRotate90_Opt) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, ARGBRotate180_Opt) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, ARGBRotate270_Opt) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
static void TestRotatePlane(int src_width, int src_height,
int dst_width, int dst_height,
static void TestRotatePlane(int src_width,
int src_height,
int dst_width,
int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
TestRotateBpp(src_width, src_height,
dst_width, dst_height,
mode, benchmark_iterations,
disable_cpu_flags, benchmark_cpu_info, 1);
TestRotateBpp(src_width, src_height, dst_width, dst_height, mode,
benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 1);
}
TEST_F(LibYUVRotateTest, RotatePlane0_Opt) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, RotatePlane90_Opt) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, RotatePlane180_Opt) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, RotatePlane270_Opt) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
} // namespace libyuv


@ -10,17 +10,20 @@
#include <stdlib.h>
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/rotate.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
static void I420TestRotate(int src_width, int src_height,
int dst_width, int dst_height,
static void I420TestRotate(int src_width,
int src_height,
int dst_width,
int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
int disable_cpu_flags,
int benchmark_cpu_info) {
if (src_width < 1) {
src_width = 1;
}
@ -50,26 +53,21 @@ static void I420TestRotate(int src_width, int src_height,
memset(dst_i420_opt, 3, dst_i420_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
I420Rotate(src_i420, src_width,
src_i420 + src_i420_y_size, (src_width + 1) / 2,
src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
dst_i420_c, dst_width,
I420Rotate(src_i420, src_width, src_i420 + src_i420_y_size,
(src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size,
(src_width + 1) / 2, dst_i420_c, dst_width,
dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
(dst_width + 1) / 2,
src_width, src_height, mode);
(dst_width + 1) / 2, src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
I420Rotate(src_i420, src_width,
src_i420 + src_i420_y_size, (src_width + 1) / 2,
src_i420 + src_i420_y_size + src_i420_uv_size,
(src_width + 1) / 2,
dst_i420_opt, dst_width,
dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
(dst_width + 1) / 2,
src_width, src_height, mode);
I420Rotate(
src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2,
src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size,
(dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
(dst_width + 1) / 2, src_width, src_height, mode);
}
// Rotation should be exact.
@ -83,30 +81,26 @@ static void I420TestRotate(int src_width, int src_height,
}
TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
@ -115,37 +109,40 @@ TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
// tested by passing an odd width command line or environment variable.
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
static void NV12TestRotate(int src_width, int src_height,
int dst_width, int dst_height,
static void NV12TestRotate(int src_width,
int src_height,
int dst_width,
int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
int disable_cpu_flags,
int benchmark_cpu_info) {
if (src_width < 1) {
src_width = 1;
}
@@ -176,23 +173,19 @@ static void NV12TestRotate(int src_width, int src_height,
memset(dst_i420_opt, 3, dst_i420_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
NV12ToI420Rotate(src_nv12, src_width,
src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
dst_i420_c, dst_width,
NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
(src_width + 1) & ~1, dst_i420_c, dst_width,
dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
(dst_width + 1) / 2,
src_width, src_height, mode);
(dst_width + 1) / 2, src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
NV12ToI420Rotate(src_nv12, src_width,
src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
dst_i420_opt, dst_width,
NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
(src_width + 1) & ~1, dst_i420_opt, dst_width,
dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
(dst_width + 1) / 2,
src_width, src_height, mode);
(dst_width + 1) / 2, src_width, src_height, mode);
}
// Rotation should be exact.
@@ -206,91 +199,79 @@ static void NV12TestRotate(int src_width, int src_height,
}
TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
} // namespace libyuv
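
Note: every rotate test above reduces to the same validation pattern: run the conversion once with SIMD masked off, once with it enabled, and require a byte-exact match. A minimal sketch of that pattern, not taken from the commit (buffer setup is elided, w and h are assumed even, and the destination strides assume a 90/270-degree rotation, where width and height swap):

// Sketch only: compares the C path against the SIMD path for I420Rotate.
// Assumes the dst buffers each hold w * h * 3 / 2 bytes.
#include <assert.h>
#include <string.h>

#include "libyuv/cpu_id.h"
#include "libyuv/rotate.h"

void CheckRotateExact(const uint8* src, uint8* dst_c, uint8* dst_opt,
                      int w, int h, libyuv::RotationMode mode) {
  int y_size = w * h;               // I420 luma plane.
  int uv_size = (w / 2) * (h / 2);  // Each chroma plane.
  libyuv::MaskCpuFlags(1);          // 1 = C only, per the test flags above.
  libyuv::I420Rotate(src, w, src + y_size, w / 2, src + y_size + uv_size,
                     w / 2, dst_c, h, dst_c + y_size, h / 2,
                     dst_c + y_size + uv_size, h / 2, w, h, mode);
  libyuv::MaskCpuFlags(-1);         // -1 = all SIMD paths.
  libyuv::I420Rotate(src, w, src + y_size, w / 2, src + y_size + uv_size,
                     w / 2, dst_opt, h, dst_opt + y_size, h / 2,
                     dst_opt + y_size + uv_size, h / 2, w, h, mode);
  assert(memcmp(dst_c, dst_opt, y_size + 2 * uv_size) == 0);  // Exact.
}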


@@ -11,11 +11,11 @@
#include <stdlib.h>
#include <time.h>
#include "../unit_test/unit_test.h"
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale_argb.h"
#include "libyuv/video_common.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
@@ -23,18 +23,22 @@ namespace libyuv {
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int ARGBTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
static int ARGBTestFilter(int src_width,
int src_height,
int dst_width,
int dst_height,
FilterMode f,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
int i, j;
const int b = 0; // 128 to test for padding/stride.
int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2) * 4LL;
int64 src_argb_plane_size =
(Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4LL;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
align_buffer_page_end(src_argb, src_argb_plane_size);
@@ -59,21 +63,18 @@ static int ARGBTestFilter(int src_width, int src_height,
// Warm up both versions for consistent benchmarks.
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4,
dst_stride_argb, dst_width, dst_height, f);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
src_width, src_height, dst_argb_opt + (dst_stride_argb * b) + b * 4,
dst_stride_argb, dst_width, dst_height, f);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
double c_time = get_time();
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4,
dst_stride_argb, dst_width, dst_height, f);
c_time = (get_time() - c_time);
@@ -88,8 +89,8 @@ static int ARGBTestFilter(int src_width, int src_height,
opt_time = (get_time() - opt_time) / benchmark_iterations;
// Report performance of C vs OPT
printf("filter %d - %8d us C - %8d us OPT\n",
f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
printf("filter %d - %8d us C - %8d us OPT\n", f,
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
@@ -115,10 +116,14 @@ static int ARGBTestFilter(int src_width, int src_height,
static const int kTileX = 8;
static const int kTileY = 8;
static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
static int TileARGBScale(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
FilterMode filtering) {
for (int y = 0; y < dst_height; y += kTileY) {
for (int x = 0; x < dst_width; x += kTileX) {
@@ -130,11 +135,9 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
if (y + clip_height > dst_height) {
clip_height = dst_height - y;
}
int r = ARGBScaleClip(src_argb, src_stride_argb,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
x, y, clip_width, clip_height, filtering);
int r = ARGBScaleClip(src_argb, src_stride_argb, src_width, src_height,
dst_argb, dst_stride_argb, dst_width, dst_height, x,
y, clip_width, clip_height, filtering);
if (r) {
return r;
}
@@ -143,16 +146,19 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
return 0;
}
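
TileARGBScale above is the interesting half of the clip test: ARGBScaleClip is expected to produce, for each destination tile, exactly the pixels a full-frame ARGBScale would produce, so a frame can be scaled incrementally. A minimal sketch of that equivalence check, not from the commit (names are illustrative, and the final memcmp assumes a tightly packed destination, stride == width * 4):

// Sketch only: full-frame scale vs. 8x8 tiled scale must match exactly.
#include <string.h>

#include "libyuv/scale_argb.h"

bool TiledMatchesFull(const uint8* src, int src_stride, int sw, int sh,
                      uint8* full, uint8* tiled, int dst_stride, int dw,
                      int dh, libyuv::FilterMode f) {
  libyuv::ARGBScale(src, src_stride, sw, sh, full, dst_stride, dw, dh, f);
  for (int ty = 0; ty < dh; ty += 8) {
    for (int tx = 0; tx < dw; tx += 8) {
      int cw = (tx + 8 > dw) ? dw - tx : 8;  // Clamp tiles at the edges.
      int ch = (ty + 8 > dh) ? dh - ty : 8;
      libyuv::ARGBScaleClip(src, src_stride, sw, sh, tiled, dst_stride, dw,
                            dh, tx, ty, cw, ch, f);
    }
  }
  return memcmp(full, tiled, dst_stride * dh) == 0;  // Tight stride assumed.
}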
static int ARGBClipTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations) {
static int ARGBClipTestFilter(int src_width,
int src_height,
int dst_width,
int dst_height,
FilterMode f,
int benchmark_iterations) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
const int b = 128;
int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2) * 4;
int64 src_argb_plane_size =
(Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
align_buffer_page_end(src_argb, src_argb_plane_size);
@@ -184,9 +190,8 @@ static int ARGBClipTestFilter(int src_width, int src_height,
// Do full image, no clipping.
double c_time = get_time();
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4,
dst_stride_argb, dst_width, dst_height, f);
c_time = (get_time() - c_time);
// Do tiled image, clipping scale to a tile at a time.
@@ -200,8 +205,8 @@ static int ARGBClipTestFilter(int src_width, int src_height,
opt_time = (get_time() - opt_time) / benchmark_iterations;
// Report performance of Full vs Tiled.
printf("filter %d - %8d us Full - %8d us Tiled\n",
f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
printf("filter %d - %8d us Full - %8d us Tiled\n", f,
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
// Compare full scaled image vs tiled image.
int max_diff = 0;
@@ -226,32 +231,30 @@ static int ARGBClipTestFilter(int src_width, int src_height,
#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \
int diff = ARGBTestFilter(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \
int diff = ARGBClipTestFilter(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \
int diff = ARGBTestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \
int diff = ARGBClipTestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering is different fixed point implementations for SSSE3, Neon and C.
#define TEST_FACTOR(name, nom, denom) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, 3)
#define TEST_FACTOR(name, nom, denom) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, 3)
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
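
For readers untangling the macro layers: each TEST_FACTOR(name, nom, denom) line expands, via TEST_FACTOR1, into eight gtest cases, a plain and a clipped variant for each of the four filters. Written out by hand (this relies on the SX/DX macros and ARGBTestFilter defined earlier in this file), the None case of TEST_FACTOR(2, 1, 2) is simply:

// Hand expansion of TEST_FACTOR(2, 1, 2), None filter, max_diff 0; the
// matching ARGBScaleDownClipBy2_None case differs only in the helper called.
TEST_F(LibYUVScaleTest, ARGBScaleDownBy2_None) {
  int diff = ARGBTestFilter(
      SX(benchmark_width_, 1, 2), SX(benchmark_height_, 1, 2),
      DX(benchmark_width_, 1, 2), DX(benchmark_height_, 1, 2), kFilterNone,
      benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);
  EXPECT_LE(diff, 0);
}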
@@ -265,39 +268,37 @@ TEST_FACTOR(3, 1, 3)
#undef DX
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
int diff = ARGBTestFilter(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(width, height, \
Abs(benchmark_width_), \
Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
int diff = ARGBTestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \
int diff = \
ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 3) \
TEST_SCALETO1(name, width, height, Bilinear, 3)
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 3) \
TEST_SCALETO1(name, width, height, Bilinear, 3)
TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 320, 240)
@@ -310,31 +311,33 @@ TEST_SCALETO(ARGBScale, 1280, 720)
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleReference2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
int YUVToARGBScaleReference2(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y,
int clip_width, int clip_height,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = static_cast<uint8*>(malloc(src_width * src_height * 4));
int r;
I420ToARGB(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
argb_buffer, src_width * 4,
src_width, src_height);
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
argb_buffer, src_width * 4, src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height,
filtering);
r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
clip_width, clip_height, filtering);
free(argb_buffer);
return r;
}
@@ -360,13 +363,17 @@ static void FillRamp(uint8* buf, int width, int height, int v, int dx, int dy) {
}
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int YUVToARGBTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
static int YUVToARGBTestFilter(int src_width,
int src_height,
int dst_width,
int dst_height,
FilterMode f,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
int64 src_y_plane_size = Abs(src_width) * Abs(src_height);
int64 src_uv_plane_size = ((Abs(src_width) + 1) / 2) *
((Abs(src_height) + 1) / 2);
int64 src_uv_plane_size =
((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2);
int src_stride_y = Abs(src_width);
int src_stride_uv = (Abs(src_width) + 1) / 2;
@@ -374,8 +381,8 @@ static int YUVToARGBTestFilter(int src_width, int src_height,
align_buffer_page_end(src_u, src_uv_plane_size);
align_buffer_page_end(src_v, src_uv_plane_size);
int64 dst_argb_plane_size = (dst_width) * (dst_height) * 4LL;
int dst_stride_argb = (dst_width) * 4;
int64 dst_argb_plane_size = (dst_width) * (dst_height)*4LL;
int dst_stride_argb = (dst_width)*4;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
@@ -390,28 +397,18 @@ static int YUVToARGBTestFilter(int src_width, int src_height,
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
YUVToARGBScaleReference2(src_y, src_stride_y,
src_u, src_stride_uv,
src_v, src_stride_uv,
libyuv::FOURCC_I420,
src_width, src_height,
dst_argb_c, dst_stride_argb,
libyuv::FOURCC_I420,
dst_width, dst_height,
0, 0, dst_width, dst_height,
f);
YUVToARGBScaleReference2(src_y, src_stride_y, src_u, src_stride_uv, src_v,
src_stride_uv, libyuv::FOURCC_I420, src_width,
src_height, dst_argb_c, dst_stride_argb,
libyuv::FOURCC_I420, dst_width, dst_height, 0, 0,
dst_width, dst_height, f);
for (int i = 0; i < benchmark_iterations; ++i) {
YUVToARGBScaleClip(src_y, src_stride_y,
src_u, src_stride_uv,
src_v, src_stride_uv,
libyuv::FOURCC_I420,
src_width, src_height,
dst_argb_opt, dst_stride_argb,
libyuv::FOURCC_I420,
dst_width, dst_height,
0, 0, dst_width, dst_height,
f);
YUVToARGBScaleClip(src_y, src_stride_y, src_u, src_stride_uv, src_v,
src_stride_uv, libyuv::FOURCC_I420, src_width,
src_height, dst_argb_opt, dst_stride_argb,
libyuv::FOURCC_I420, dst_width, dst_height, 0, 0,
dst_width, dst_height, f);
}
int max_diff = 0;
for (int i = 0; i < dst_height; ++i) {
@@ -419,9 +416,7 @@ static int YUVToARGBTestFilter(int src_width, int src_height,
int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
dst_argb_opt[(i * dst_stride_argb) + j]);
if (abs_diff > max_diff) {
printf("error %d at %d,%d c %d opt %d",
abs_diff,
j, i,
printf("error %d at %d,%d c %d opt %d", abs_diff, j, i,
dst_argb_c[(i * dst_stride_argb) + j],
dst_argb_opt[(i * dst_stride_argb) + j]);
EXPECT_LE(abs_diff, 40);
@@ -439,24 +434,19 @@ static int YUVToARGBTestFilter(int src_width, int src_height,
}
TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
int diff = YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
benchmark_width_ * 3 / 2,
benchmark_height_ * 3 / 2,
libyuv::kFilterBilinear,
benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
int diff = YUVToARGBTestFilter(
benchmark_width_, benchmark_height_, benchmark_width_ * 3 / 2,
benchmark_height_ * 3 / 2, libyuv::kFilterBilinear, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
EXPECT_LE(diff, 10);
}
TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
int diff = YUVToARGBTestFilter(benchmark_width_ * 3 / 2,
benchmark_height_ * 3 / 2,
benchmark_width_, benchmark_height_,
libyuv::kFilterBilinear,
benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
int diff = YUVToARGBTestFilter(
benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, benchmark_width_,
benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
EXPECT_LE(diff, 10);
}
} // namespace libyuv
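
The reference path above (I420ToARGB into a temporary, then ARGBScaleClip) exists only to validate the fused call. In application code the fused API is a one-liner; a minimal sketch, not from the commit, assuming a tightly packed ARGB destination and the same FOURCC arguments the test passes:

// Sketch only: fused I420 -> scaled ARGB, no intermediate full-size buffer.
#include "libyuv/scale_argb.h"
#include "libyuv/video_common.h"

int ScaleI420ToARGB(const uint8* y, const uint8* u, const uint8* v, int sw,
                    int sh, uint8* argb, int dw, int dh) {
  int suv = (sw + 1) / 2;  // I420 chroma stride.
  return libyuv::YUVToARGBScaleClip(
      y, sw, u, suv, v, suv, libyuv::FOURCC_I420, sw, sh, argb, dw * 4,
      libyuv::FOURCC_I420, dw, dh, 0, 0, dw, dh, libyuv::kFilterBilinear);
}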


@@ -11,9 +11,9 @@
#include <stdlib.h>
#include <time.h>
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
#include "../unit_test/unit_test.h"
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
@@ -21,10 +21,14 @@
namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int TestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
static int TestFilter(int src_width,
int src_height,
int dst_width,
int dst_height,
FilterMode f,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -41,9 +45,8 @@ static int TestFilter(int src_width, int src_height,
int src_stride_uv = b * 2 + src_width_uv;
align_buffer_page_end(src_y, src_y_plane_size)
align_buffer_page_end(src_u, src_uv_plane_size)
align_buffer_page_end(src_v, src_uv_plane_size)
if (!src_y || !src_u || !src_v) {
align_buffer_page_end(src_u, src_uv_plane_size) align_buffer_page_end(
src_v, src_uv_plane_size) if (!src_y || !src_u || !src_v) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
@@ -61,13 +64,15 @@ static int TestFilter(int src_width, int src_height,
int dst_stride_uv = b * 2 + dst_width_uv;
align_buffer_page_end(dst_y_c, dst_y_plane_size)
align_buffer_page_end(dst_u_c, dst_uv_plane_size)
align_buffer_page_end(dst_v_c, dst_uv_plane_size)
align_buffer_page_end(dst_y_opt, dst_y_plane_size)
align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
align_buffer_page_end(dst_v_opt, dst_uv_plane_size)
if (!dst_y_c || !dst_u_c || !dst_v_c ||
!dst_y_opt|| !dst_u_opt|| !dst_v_opt) {
align_buffer_page_end(dst_u_c, dst_uv_plane_size)
align_buffer_page_end(dst_v_c, dst_uv_plane_size)
align_buffer_page_end(dst_y_opt, dst_y_plane_size)
align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
align_buffer_page_end(
dst_v_opt,
dst_uv_plane_size) if (!dst_y_c || !dst_u_c ||
!dst_v_c || !dst_y_opt ||
!dst_u_opt || !dst_v_opt) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
@@ -76,12 +81,11 @@ static int TestFilter(int src_width, int src_height,
double c_time = get_time();
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
src_height, dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
dst_height, f);
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
@@ -89,19 +93,16 @@ static int TestFilter(int src_width, int src_height,
for (i = 0; i < benchmark_iterations; ++i) {
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
src_height, dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
dst_height, f);
}
opt_time = (get_time() - opt_time) / benchmark_iterations;
// Report performance of C vs OPT
printf("filter %d - %8d us C - %8d us OPT\n",
f,
static_cast<int>(c_time * 1e6),
static_cast<int>(opt_time * 1e6));
printf("filter %d - %8d us C - %8d us OPT\n", f,
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
@@ -133,25 +134,27 @@ static int TestFilter(int src_width, int src_height,
}
}
free_aligned_buffer_page_end(dst_y_c)
free_aligned_buffer_page_end(dst_u_c)
free_aligned_buffer_page_end(dst_v_c)
free_aligned_buffer_page_end(dst_y_opt)
free_aligned_buffer_page_end(dst_u_opt)
free_aligned_buffer_page_end(dst_v_opt)
free_aligned_buffer_page_end(dst_y_c) free_aligned_buffer_page_end(dst_u_c)
free_aligned_buffer_page_end(dst_v_c)
free_aligned_buffer_page_end(dst_y_opt)
free_aligned_buffer_page_end(dst_u_opt)
free_aligned_buffer_page_end(dst_v_opt)
free_aligned_buffer_page_end(src_y)
free_aligned_buffer_page_end(src_u)
free_aligned_buffer_page_end(src_v)
free_aligned_buffer_page_end(src_y)
free_aligned_buffer_page_end(src_u)
free_aligned_buffer_page_end(src_v)
return max_diff;
return max_diff;
}
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
static int TestFilter_16(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations) {
static int TestFilter_16(int src_width,
int src_height,
int dst_width,
int dst_height,
FilterMode f,
int benchmark_iterations) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -161,20 +164,18 @@ static int TestFilter_16(int src_width, int src_height,
int src_width_uv = (Abs(src_width) + 1) >> 1;
int src_height_uv = (Abs(src_height) + 1) >> 1;
int64 src_y_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2);
int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
int src_stride_y = b * 2 + Abs(src_width);
int src_stride_uv = b * 2 + src_width_uv;
align_buffer_page_end(src_y, src_y_plane_size)
align_buffer_page_end(src_u, src_uv_plane_size)
align_buffer_page_end(src_v, src_uv_plane_size)
align_buffer_page_end(src_y_16, src_y_plane_size * 2)
align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16);
align_buffer_page_end(src_y, src_y_plane_size) align_buffer_page_end(
src_u, src_uv_plane_size) align_buffer_page_end(src_v, src_uv_plane_size)
align_buffer_page_end(src_y_16, src_y_plane_size * 2)
align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16);
uint16* p_src_u_16 = reinterpret_cast<uint16*>(src_u_16);
uint16* p_src_v_16 = reinterpret_cast<uint16*>(src_v_16);
@@ -205,34 +206,33 @@ static int TestFilter_16(int src_width, int src_height,
int dst_stride_uv = b * 2 + dst_width_uv;
align_buffer_page_end(dst_y_8, dst_y_plane_size)
align_buffer_page_end(dst_u_8, dst_uv_plane_size)
align_buffer_page_end(dst_v_8, dst_uv_plane_size)
align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
align_buffer_page_end(dst_u_8, dst_uv_plane_size)
align_buffer_page_end(dst_v_8, dst_uv_plane_size)
align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
uint16* p_dst_y_16 = reinterpret_cast<uint16*>(dst_y_16);
uint16* p_dst_y_16 =
reinterpret_cast<uint16*>(dst_y_16);
uint16* p_dst_u_16 = reinterpret_cast<uint16*>(dst_u_16);
uint16* p_dst_v_16 = reinterpret_cast<uint16*>(dst_v_16);
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
src_height, dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
dst_height, f);
for (i = 0; i < benchmark_iterations; ++i) {
I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y,
p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv,
p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv, src_width,
src_height, p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv,
p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
dst_height, f);
}
// Expect an exact match
@@ -262,21 +262,20 @@ static int TestFilter_16(int src_width, int src_height,
}
}
free_aligned_buffer_page_end(dst_y_8)
free_aligned_buffer_page_end(dst_u_8)
free_aligned_buffer_page_end(dst_v_8)
free_aligned_buffer_page_end(dst_y_16)
free_aligned_buffer_page_end(dst_u_16)
free_aligned_buffer_page_end(dst_v_16)
free_aligned_buffer_page_end(dst_y_8) free_aligned_buffer_page_end(dst_u_8)
free_aligned_buffer_page_end(dst_v_8)
free_aligned_buffer_page_end(dst_y_16)
free_aligned_buffer_page_end(dst_u_16)
free_aligned_buffer_page_end(dst_v_16)
free_aligned_buffer_page_end(src_y)
free_aligned_buffer_page_end(src_u)
free_aligned_buffer_page_end(src_v)
free_aligned_buffer_page_end(src_y_16)
free_aligned_buffer_page_end(src_u_16)
free_aligned_buffer_page_end(src_v_16)
free_aligned_buffer_page_end(src_y)
free_aligned_buffer_page_end(src_u)
free_aligned_buffer_page_end(src_v)
free_aligned_buffer_page_end(src_y_16)
free_aligned_buffer_page_end(src_u_16)
free_aligned_buffer_page_end(src_v_16)
return max_diff;
return max_diff;
}
// The following adjustments in dimensions ensure the scale factor will be
@@ -285,32 +284,30 @@ static int TestFilter_16(int src_width, int src_height,
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
int diff = TestFilter(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \
int diff = TestFilter_16(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
int diff = TestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \
int diff = TestFilter_16( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering is different fixed point implementations for SSSE3, Neon and C.
#define TEST_FACTOR(name, nom, denom, boxdiff) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, boxdiff)
#define TEST_FACTOR(name, nom, denom, boxdiff) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, boxdiff)
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
@@ -323,42 +320,40 @@ TEST_FACTOR(3, 1, 3, 0)
#undef SX
#undef DX
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = TestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
int diff = TestFilter(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
DISABLED_##name##To##width##x##height##_##filter##_16) { \
int diff = TestFilter_16(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
DISABLED_##name##From##width##x##height##_##filter##_16) { \
int diff = TestFilter_16(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = TestFilter(benchmark_width_, benchmark_height_, width, height, \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
int diff = TestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
DISABLED_##name##To##width##x##height##_##filter##_16) { \
int diff = TestFilter_16(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
DISABLED_##name##From##width##x##height##_##filter##_16) { \
int diff = TestFilter_16(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 0) \
TEST_SCALETO1(name, width, height, Bilinear, 0) \
TEST_SCALETO1(name, width, height, Box, 0)
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 0) \
TEST_SCALETO1(name, width, height, Bilinear, 0) \
TEST_SCALETO1(name, width, height, Box, 0)
TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240)
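
The SX/DX pair used by these tests is easy to misread; all it does is round the benchmark dimension so the source/destination ratio is exactly denom/nom (the quotient is forced even). A worked instance with the 1280-wide benchmark default and a 1/2 factor (nom = 1, denom = 2):

// SX(x, nom, denom) = ((x / nom + 1) / 2) * denom * 2       (source size)
// DX(x, nom, denom) = ((Abs(x) / nom + 1) / 2) * nom * 2    (destination)
//
// SX(1280, 1, 2) = ((1280 + 1) / 2) * 2 * 2 = 640 * 4 = 2560
// DX(1280, 1, 2) = ((1280 + 1) / 2) * 1 * 2 = 640 * 2 = 1280
//
// 2560 -> 1280 is exactly 1/2, so the ScaleDownBy2 tests exercise the true
// power-of-two fast paths rather than a nearby fractional scale.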


@@ -25,18 +25,21 @@ unsigned int fastrand_seed = 0xfb;
DEFINE_int32(libyuv_width, 0, "width of test image.");
DEFINE_int32(libyuv_height, 0, "height of test image.");
DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test.");
DEFINE_int32(libyuv_flags, 0,
"cpu flags for reference code. 1 = C, -1 = SIMD");
DEFINE_int32(libyuv_cpu_info, 0,
DEFINE_int32(libyuv_flags, 0, "cpu flags for reference code. 1 = C, -1 = SIMD");
DEFINE_int32(libyuv_cpu_info,
0,
"cpu flags for benchmark code. 1 = C, -1 = SIMD");
// For quicker unittests, default is 128 x 72. But when benchmarking,
// default to 720p. Allow size to specify.
// Set flags to -1 for benchmarking to avoid slower C code.
LibYUVConvertTest::LibYUVConvertTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
LibYUVConvertTest::LibYUVConvertTest()
: benchmark_iterations_(BENCHMARK_ITERATIONS),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -76,19 +79,26 @@ LibYUVConvertTest::LibYUVConvertTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
benchmark_pixels_div256_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
255.0) /
256.0);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
1279.0) /
1280.0);
}
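
The benchmark_pixels_ computation above looks heavier after reformatting than it is: it is a ceiling division of the total pixels processed. With the quick-test defaults (128 x 72, one iteration):

// pixels  = 128 * 72 * 1                  = 9216
// div256  = (9216 + 255.0)  / 256.0  -> 36   (== ceil(9216 / 256))
// div1280 = (9216 + 1279.0) / 1280.0 -> 8    (== ceil(9216 / 1280))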
LibYUVColorTest::LibYUVColorTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
LibYUVColorTest::LibYUVColorTest()
: benchmark_iterations_(BENCHMARK_ITERATIONS),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -128,19 +138,26 @@ LibYUVColorTest::LibYUVColorTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
benchmark_pixels_div256_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
255.0) /
256.0);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
1279.0) /
1280.0);
}
LibYUVScaleTest::LibYUVScaleTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
LibYUVScaleTest::LibYUVScaleTest()
: benchmark_iterations_(BENCHMARK_ITERATIONS),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -180,19 +197,26 @@ LibYUVScaleTest::LibYUVScaleTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
benchmark_pixels_div256_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
255.0) /
256.0);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
1279.0) /
1280.0);
}
LibYUVRotateTest::LibYUVRotateTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
LibYUVRotateTest::LibYUVRotateTest()
: benchmark_iterations_(BENCHMARK_ITERATIONS),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -232,19 +256,26 @@ LibYUVRotateTest::LibYUVRotateTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
benchmark_pixels_div256_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
255.0) /
256.0);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
1279.0) /
1280.0);
}
LibYUVPlanarTest::LibYUVPlanarTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
LibYUVPlanarTest::LibYUVPlanarTest()
: benchmark_iterations_(BENCHMARK_ITERATIONS),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -284,19 +315,26 @@ LibYUVPlanarTest::LibYUVPlanarTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
benchmark_pixels_div256_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
255.0) /
256.0);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
1279.0) /
1280.0);
}
LibYUVBaseTest::LibYUVBaseTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
LibYUVBaseTest::LibYUVBaseTest()
: benchmark_iterations_(BENCHMARK_ITERATIONS),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -336,14 +374,18 @@ LibYUVBaseTest::LibYUVBaseTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
benchmark_pixels_div256_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
255.0) /
256.0);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
1279.0) /
1280.0);
}
int main(int argc, char** argv) {


@@ -14,8 +14,8 @@
#ifdef WIN32
#include <windows.h>
#else
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/time.h>
#endif
#include <gtest/gtest.h>
@@ -54,8 +54,10 @@ static __inline int Abs(int v) {
static const int kMaxWidth = 32768;
static const int kMaxHeight = 32768;
static inline bool SizeValid(int src_width, int src_height,
int dst_width, int dst_height) {
static inline bool SizeValid(int src_width,
int src_height,
int dst_width,
int dst_height) {
if (src_width > kMaxWidth || src_height > kMaxHeight ||
dst_width > kMaxWidth || dst_height > kMaxHeight) {
printf("Warning - size too large to test. Skipping\n");
@@ -64,15 +66,16 @@ static inline bool SizeValid(int src_width, int src_height,
return true;
}
#define align_buffer_page_end(var, size) \
uint8* var; \
uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - \
(size)) & ~63);
#define align_buffer_page_end(var, size) \
uint8* var; \
uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - \
(size)) & \
~63);
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \
free(var##_mem); \
var = 0;
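
These two macros are the test suite's overread detector: the malloc is rounded up to a whole number of 4 KB pages and the returned pointer is placed, 64-byte aligned, so the buffer ends at (to within the alignment fixup) the rounded-up end of the block, making a read past `size` land as close to the page boundary as possible. A usage sketch, not from the commit; note the macro itself declares the variable:

// Sketch only: align_buffer_page_end declares and assigns `pixels`.
#include <string.h>

void ExampleUse() {
  align_buffer_page_end(pixels, 1280 * 720 * 4)  // declares uint8* pixels
  memset(pixels, 0, 1280 * 720 * 4);
  // ... run the code under test against `pixels` ...
  free_aligned_buffer_page_end(pixels)           // frees and nulls it
}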
#ifdef WIN32
@@ -122,78 +125,78 @@ class LibYUVColorTest : public ::testing::Test {
protected:
LibYUVColorTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVConvertTest : public ::testing::Test {
protected:
LibYUVConvertTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVScaleTest : public ::testing::Test {
protected:
LibYUVScaleTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVRotateTest : public ::testing::Test {
protected:
LibYUVRotateTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVPlanarTest : public ::testing::Test {
protected:
LibYUVPlanarTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVBaseTest : public ::testing::Test {
protected:
LibYUVBaseTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
#endif // UNIT_TEST_UNIT_TEST_H_ NOLINT


@@ -11,26 +11,23 @@
#include <stdlib.h>
#include <string.h>
#include "libyuv/video_common.h"
#include "../unit_test/unit_test.h"
#include "libyuv/video_common.h"
namespace libyuv {
// Tests FourCC codes in video common, which are used for ConvertToI420().
static bool TestValidChar(uint32 onecc) {
if ((onecc >= '0' && onecc <= '9') ||
(onecc >= 'A' && onecc <= 'Z') ||
(onecc >= 'a' && onecc <= 'z') ||
(onecc == ' ') || (onecc == 0xff)) {
if ((onecc >= '0' && onecc <= '9') || (onecc >= 'A' && onecc <= 'Z') ||
(onecc >= 'a' && onecc <= 'z') || (onecc == ' ') || (onecc == 0xff)) {
return true;
}
return false;
}
static bool TestValidFourCC(uint32 fourcc, int bpp) {
if (!TestValidChar(fourcc & 0xff) ||
!TestValidChar((fourcc >> 8) & 0xff) ||
if (!TestValidChar(fourcc & 0xff) || !TestValidChar((fourcc >> 8) & 0xff) ||
!TestValidChar((fourcc >> 16) & 0xff) ||
!TestValidChar((fourcc >> 24) & 0xff)) {
return false;
@@ -52,10 +49,10 @@ TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
EXPECT_EQ(static_cast<uint32>(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY));
EXPECT_EQ(static_cast<uint32>(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG));
EXPECT_EQ(static_cast<uint32>(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1));
EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3));
EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3));
EXPECT_EQ(static_cast<uint32>(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3));
EXPECT_EQ(static_cast<uint32>(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32));
EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24));
EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24));
EXPECT_EQ(static_cast<uint32>(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555));
EXPECT_EQ(static_cast<uint32>(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565));
EXPECT_EQ(static_cast<uint32>(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551));
@@ -77,7 +74,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
@@ -100,7 +97,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3));
EXPECT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264));
EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY));
EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY));
}
} // namespace libyuv
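
For reference, a FourCC is just four ASCII bytes packed little-endian into a uint32, which is what TestValidChar above vets byte by byte. A small sketch, assuming the packing macro that libyuv/video_common.h provides:

// Sketch only: pack a FourCC and unpack it again.
#include <stdio.h>

#include "libyuv/video_common.h"

int main() {
  uint32 cc = FOURCC('I', '4', '2', '0');
  printf("%c%c%c%c\n", static_cast<char>(cc & 0xff),
         static_cast<char>((cc >> 8) & 0xff),
         static_cast<char>((cc >> 16) & 0xff),
         static_cast<char>((cc >> 24) & 0xff));  // Prints "I420".
  return 0;
}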


@@ -39,10 +39,12 @@ int main(int argc, char** argv) {
int amt2 = 0;
do {
amt1 = static_cast<int>(fread(buf1, 1, kBlockSize, fin1));
if (amt1 > 0) hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
if (amt1 > 0)
hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
if (fin2) {
amt2 = static_cast<int>(fread(buf2, 1, kBlockSize, fin2));
if (amt2 > 0) hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
if (amt2 > 0)
hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
int amt_min = (amt1 < amt2) ? amt1 : amt2;
size_min += amt_min;
sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min);
@ -52,8 +54,8 @@ int main(int argc, char** argv) {
printf("hash1 %x", hash1);
if (fin2) {
printf(", hash2 %x", hash2);
double mse = static_cast<double>(sum_square_err) /
static_cast<double>(size_min);
double mse =
static_cast<double>(sum_square_err) / static_cast<double>(size_min);
printf(", mse %.2f", mse);
double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min);
printf(", psnr %.2f\n", psnr);

View File

@ -29,13 +29,13 @@ bool verbose = false;
bool attenuate = false;
bool unattenuate = false;
int image_width = 0, image_height = 0; // original width and height
int dst_width = 0, dst_height = 0; // new width and height
int dst_width = 0, dst_height = 0; // new width and height
int fileindex_org = 0; // argv argument contains the original file name.
int fileindex_rec = 0; // argv argument contains the reconstructed file name.
int num_rec = 0; // Number of reconstructed images.
int num_skip_org = 0; // Number of frames to skip in original.
int num_frames = 0; // Number of frames to convert.
int filter = 1; // Bilinear filter for scaling.
int num_rec = 0; // Number of reconstructed images.
int num_skip_org = 0; // Number of frames to skip in original.
int num_frames = 0; // Number of frames to convert.
int filter = 1; // Bilinear filter for scaling.
static __inline uint32 Abs(int32 v) {
return v >= 0 ? v : -v;
@ -48,8 +48,8 @@ bool ExtractResolutionFromFilename(const char* name,
// Isolate the .width_height. section of the filename by searching for a
// dot or underscore followed by a digit.
for (int i = 0; name[i]; ++i) {
if ((name[i] == '.' || name[i] == '_') &&
name[i + 1] >= '0' && name[i + 1] <= '9') {
if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' &&
name[i + 1] <= '9') {
int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT
if (2 == n) {
return true;
@ -59,13 +59,14 @@ bool ExtractResolutionFromFilename(const char* name,
return false;
}
void PrintHelp(const char * program) {
void PrintHelp(const char* program) {
printf("%s [-options] src_argb.raw dst_yuv.raw\n", program);
printf(" -s <width> <height> .... specify source resolution. "
"Optional if name contains\n"
" resolution (ie. "
"name.1920x800_24Hz_P420.yuv)\n"
" Negative value mirrors.\n");
printf(
" -s <width> <height> .... specify source resolution. "
"Optional if name contains\n"
" resolution (ie. "
"name.1920x800_24Hz_P420.yuv)\n"
" Negative value mirrors.\n");
printf(" -d <width> <height> .... specify destination resolution.\n");
printf(" -f <filter> ............ 0 = point, 1 = bilinear (default).\n");
printf(" -skip <src_argb> ....... Number of frame to skip of src_argb\n");
@ -78,7 +79,8 @@ void PrintHelp(const char * program) {
}
void ParseOptions(int argc, const char* argv[]) {
if (argc <= 1) PrintHelp(argv[0]);
if (argc <= 1)
PrintHelp(argv[0]);
for (int c = 1; c < argc; ++c) {
if (!strcmp(argv[c], "-v")) {
verbose = true;
@ -89,17 +91,17 @@ void ParseOptions(int argc, const char* argv[]) {
} else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
PrintHelp(argv[0]);
} else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
image_width = atoi(argv[++c]); // NOLINT
image_height = atoi(argv[++c]); // NOLINT
image_width = atoi(argv[++c]); // NOLINT
image_height = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-d") && c + 2 < argc) {
dst_width = atoi(argv[++c]); // NOLINT
dst_height = atoi(argv[++c]); // NOLINT
dst_width = atoi(argv[++c]); // NOLINT
dst_height = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-skip") && c + 1 < argc) {
num_skip_org = atoi(argv[++c]); // NOLINT
num_skip_org = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
num_frames = atoi(argv[++c]); // NOLINT
num_frames = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-f") && c + 1 < argc) {
filter = atoi(argv[++c]); // NOLINT
filter = atoi(argv[++c]); // NOLINT
} else if (argv[c][0] == '-') {
fprintf(stderr, "Unknown option. %s\n", argv[c]);
} else if (fileindex_org == 0) {
@ -127,11 +129,9 @@ void ParseOptions(int argc, const char* argv[]) {
int org_width, org_height;
int rec_width, rec_height;
bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
&org_width,
&org_height);
&org_width, &org_height);
bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
&rec_width,
&rec_height);
&rec_width, &rec_height);
if (image_width == 0 || image_height == 0) {
if (org_res_avail) {
image_width = org_width;
@ -158,10 +158,14 @@ void ParseOptions(int argc, const char* argv[]) {
static const int kTileX = 32;
static const int kTileY = 32;
static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
static int TileARGBScale(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
libyuv::FilterMode filtering) {
for (int y = 0; y < dst_height; y += kTileY) {
for (int x = 0; x < dst_width; x += kTileX) {
@ -173,11 +177,10 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
if (y + clip_height > dst_height) {
clip_height = dst_height - y;
}
int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
x, y, clip_width, clip_height, filtering);
int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, src_width,
src_height, dst_argb, dst_stride_argb,
dst_width, dst_height, x, y, clip_width,
clip_height, filtering);
if (r) {
return r;
}
@ -197,8 +200,8 @@ int main(int argc, const char* argv[]) {
}
// Open all files to convert to
FILE** file_rec = new FILE* [num_rec];
memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
FILE** file_rec = new FILE*[num_rec];
memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb");
if (file_rec[cur_rec] == NULL) {
@ -222,8 +225,8 @@ int main(int argc, const char* argv[]) {
// Input is YUV
if (org_is_yuv) {
const int y_size = Abs(image_width) * Abs(image_height);
const int uv_size = ((Abs(image_width) + 1) / 2) *
((Abs(image_height) + 1) / 2);
const int uv_size =
((Abs(image_width) + 1) / 2) * ((Abs(image_height) + 1) / 2);
org_size = y_size + 2 * uv_size; // YUV original.
}
@ -233,8 +236,8 @@ int main(int argc, const char* argv[]) {
const size_t total_size = y_size + 2 * uv_size;
#if defined(_MSC_VER)
_fseeki64(file_org,
static_cast<__int64>(num_skip_org) *
static_cast<__int64>(org_size), SEEK_SET);
static_cast<__int64>(num_skip_org) * static_cast<__int64>(org_size),
SEEK_SET);
#else
fseek(file_org, num_skip_org * total_size, SEEK_SET);
#endif
@ -256,18 +259,18 @@ int main(int argc, const char* argv[]) {
}
if (verbose) {
printf("Size: %dx%d to %dx%d\n", image_width, image_height,
dst_width, dst_height);
printf("Size: %dx%d to %dx%d\n", image_width, image_height, dst_width,
dst_height);
}
int number_of_frames;
for (number_of_frames = 0; ; ++number_of_frames) {
for (number_of_frames = 0;; ++number_of_frames) {
if (num_frames && number_of_frames >= num_frames)
break;
// Load original YUV or ARGB frame.
size_t bytes_org = fread(ch_org, sizeof(uint8),
static_cast<size_t>(org_size), file_org);
size_t bytes_org =
fread(ch_org, sizeof(uint8), static_cast<size_t>(org_size), file_org);
if (bytes_org < static_cast<size_t>(org_size))
break;
@ -290,22 +293,17 @@ int main(int argc, const char* argv[]) {
int half_src_height = (src_height + 1) / 2;
int half_dst_width = (dst_width + 1) / 2;
int half_dst_height = (dst_height + 1) / 2;
I420Scale(ch_org, src_width,
ch_org + src_width * src_height, half_src_width,
ch_org + src_width * src_height +
half_src_width * half_src_height, half_src_width,
image_width, image_height,
ch_rec, dst_width,
ch_rec + dst_width * dst_height, half_dst_width,
ch_rec + dst_width * dst_height +
half_dst_width * half_dst_height, half_dst_width,
dst_width, dst_height,
static_cast<libyuv::FilterMode>(filter));
I420Scale(
ch_org, src_width, ch_org + src_width * src_height, half_src_width,
ch_org + src_width * src_height + half_src_width * half_src_height,
half_src_width, image_width, image_height, ch_rec, dst_width,
ch_rec + dst_width * dst_height, half_dst_width,
ch_rec + dst_width * dst_height + half_dst_width * half_dst_height,
half_dst_width, dst_width, dst_height,
static_cast<libyuv::FilterMode>(filter));
} else {
TileARGBScale(ch_org, Abs(image_width) * 4,
image_width, image_height,
ch_dst, dst_width * 4,
dst_width, dst_height,
TileARGBScale(ch_org, Abs(image_width) * 4, image_width, image_height,
ch_dst, dst_width * 4, dst_width, dst_height,
static_cast<libyuv::FilterMode>(filter));
}
bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL;
@ -321,25 +319,24 @@ int main(int argc, const char* argv[]) {
if (!org_is_yuv && rec_is_yuv) {
int half_width = (dst_width + 1) / 2;
int half_height = (dst_height + 1) / 2;
libyuv::ARGBToI420(ch_dst, dst_width * 4,
ch_rec, dst_width,
ch_rec + dst_width * dst_height, half_width,
ch_rec + dst_width * dst_height +
half_width * half_height, half_width,
dst_width, dst_height);
libyuv::ARGBToI420(
ch_dst, dst_width * 4, ch_rec, dst_width,
ch_rec + dst_width * dst_height, half_width,
ch_rec + dst_width * dst_height + half_width * half_height,
half_width, dst_width, dst_height);
}
// Output YUV or ARGB frame.
if (rec_is_yuv) {
size_t bytes_rec = fwrite(ch_rec, sizeof(uint8),
static_cast<size_t>(total_size),
file_rec[cur_rec]);
size_t bytes_rec =
fwrite(ch_rec, sizeof(uint8), static_cast<size_t>(total_size),
file_rec[cur_rec]);
if (bytes_rec < static_cast<size_t>(total_size))
break;
} else {
size_t bytes_rec = fwrite(ch_dst, sizeof(uint8),
static_cast<size_t>(dst_size),
file_rec[cur_rec]);
size_t bytes_rec =
fwrite(ch_dst, sizeof(uint8), static_cast<size_t>(dst_size),
file_rec[cur_rec]);
if (bytes_rec < static_cast<size_t>(dst_size))
break;
}
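Aside, a worked example of the clip math in TileARGBScale above: for a 100x80 destination with kTileX = kTileY = 32, each ARGBScaleClip call scales the whole frame but writes only one clamped tile:

// x = 0, 32, 64, 96  ->  clip_width  = 32, 32, 32, 4   (96 + 32 > 100)
// y = 0, 32, 64      ->  clip_height = 32, 32, 16      (64 + 32 > 80)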

View File

@ -27,7 +27,7 @@ typedef unsigned __int64 uint64;
#else // COMPILER_MSVC
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64; // NOLINT
#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64; // NOLINT
#endif // __LP64__
#endif // _MSC_VER
@ -39,85 +39,81 @@ typedef unsigned long long uint64; // NOLINT
!defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
const uint8* src_b, int count) {
const uint8* src_b,
int count) {
volatile uint32 sse;
asm volatile (
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
asm volatile(
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q7, d4, d4 \n"
"vmlal.s16 q8, d6, d6 \n"
"vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n"
"bhi 1b \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q7, d4, d4 \n"
"vmlal.s16 q8, d6, d6 \n"
"vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n"
"bhi 1b \n"
"vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n"
"vadd.u32 q10, q7, q9 \n"
"vpaddl.u32 q1, q10 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
"vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n"
"vadd.u32 q10, q7, q9 \n"
"vpaddl.u32 q1, q10 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
const uint8* src_b, int count) {
const uint8* src_b,
int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
asm volatile(
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked)
static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
const uint8* /*src_b*/, int /*count*/) {
__declspec(naked) static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
const uint8* /*src_b*/,
int /*count*/) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
pxor xmm0, xmm0
pxor xmm5, xmm5
sub edx, eax
@ -151,47 +147,49 @@ static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a,
const uint8* src_b, int count) {
const uint8* src_b,
int count) {
uint32 sse;
asm volatile ( // NOLINT
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
asm volatile( // NOLINT
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
"1: \n"
"movdqu (%0),%%xmm1 \n"
"movdqu (%0,%1,1),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
"movdqu %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
"por %%xmm2,%%xmm1 \n"
"movdqu %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm2 \n"
"pmaddwd %%xmm1,%%xmm1 \n"
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
"1: \n"
"movdqu (%0),%%xmm1 \n"
"movdqu (%0,%1,1),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
"movdqu %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
"por %%xmm2,%%xmm1 \n"
"movdqu %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm2 \n"
"pmaddwd %%xmm1,%%xmm1 \n"
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"pshufd $0x1,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"movd %%xmm0,%3 \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"pshufd $0x1,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=g"(sse) // %3
:
: "memory", "cc"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=g"(sse) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
); // NOLINT
); // NOLINT
return sse;
}
#endif // LIBYUV_DISABLE_X86 etc
@ -199,20 +197,22 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile ( // NOLINT
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
asm volatile( // NOLINT
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]),
"=d"(cpu_info[3])
: "a"(info_type));
}
// For gcc/clang but not clangcl.
#elif (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile ( // NOLINT
"cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
asm volatile( // NOLINT
"cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
"=d"(cpu_info[3])
: "a"(info_type));
}
#endif
@ -229,7 +229,8 @@ static int CpuHasSSE2() {
#endif // HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_C(const uint8* src_a,
const uint8* src_b, int count) {
const uint8* src_b,
int count) {
uint32 sse = 0u;
for (int x = 0; x < count; ++x) {
int diff = src_a[x] - src_b[x];
@ -239,9 +240,10 @@ static uint32 SumSquareError_C(const uint8* src_a,
}
double ComputeSumSquareError(const uint8* src_a,
const uint8* src_b, int count) {
uint32 (*SumSquareError)(const uint8* src_a,
const uint8* src_b, int count) = SumSquareError_C;
const uint8* src_b,
int count) {
uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
SumSquareError = SumSquareError_NEON;
#endif
@ -253,7 +255,7 @@ double ComputeSumSquareError(const uint8* src_a,
const int kBlockSize = 1 << 15;
uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
#pragma omp parallel for reduction(+ : sse)
#endif
for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
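Aside: the per-block SSE accumulated here feeds PSNR as 10 * log10(255^2 * count / sse). A minimal sketch of that conversion, assuming a cap for identical inputs (the exact cap used by libyuv::SumSquareErrorToPsnr is declared in util/psnr.h):

#include <math.h>
// Sketch only; mirrors the usual SSE -> PSNR formula.
static double SsePsnrSketch(double sse, double count) {
  const double kMaxPsnr = 128.0;  // assumed cap when the inputs are identical
  if (sse <= 0.0)
    return kMaxPsnr;
  return 10.0 * log10(255.0 * 255.0 * count / sse);
}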

View File

@ -71,8 +71,8 @@ bool ExtractResolutionFromFilename(const char* name,
// Isolate the .width_height. section of the filename by searching for a
// dot or underscore followed by a digit.
for (int i = 0; name[i]; ++i) {
if ((name[i] == '.' || name[i] == '_') &&
name[i + 1] >= '0' && name[i + 1] <= '9') {
if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' &&
name[i + 1] <= '9') {
int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT
if (2 == n) {
return true;
@ -88,7 +88,7 @@ bool ExtractResolutionFromFilename(const char* name,
return false;
}
fseek(file_org, 0, SEEK_END);
size_t total_size = ftell(file_org);
size_t total_size = ftell(file_org);
fseek(file_org, 0, SEEK_SET);
uint8* const ch_org = new uint8[total_size];
memset(ch_org, 0, total_size);
@ -109,8 +109,10 @@ bool ExtractResolutionFromFilename(const char* name,
// This can be useful when comparing codecs that are inconsistent about Y
uint8 ScaleY(uint8 y) {
int ny = (y - 16) * 256 / 224;
if (ny < 0) ny = 0;
if (ny > 255) ny = 255;
if (ny < 0)
ny = 0;
if (ny > 255)
ny = 255;
return static_cast<uint8>(ny);
}
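Worked values for ScaleY above (studio-range expansion with a 224 divisor, then clamping):

// ScaleY(16)  -> (0   * 256) / 224 = 0
// ScaleY(128) -> (112 * 256) / 224 = 128   (the midpoint maps to itself)
// ScaleY(240) -> (224 * 256) / 224 = 256, clamped to 255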
@ -119,16 +121,18 @@ double GetMSE(double sse, double size) {
return sse / size;
}
void PrintHelp(const char * program) {
void PrintHelp(const char* program) {
printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program);
#ifdef HAVE_JPEG
printf("jpeg or raw YUV 420 supported.\n");
#endif
printf("options:\n");
printf(" -s <width> <height> .... specify YUV size, mandatory if none of the "
"sequences have the\n");
printf(" resolution embedded in their filename (ie. "
"name.1920x800_24Hz_P420.yuv)\n");
printf(
" -s <width> <height> .... specify YUV size, mandatory if none of the "
"sequences have the\n");
printf(
" resolution embedded in their filename (ie. "
"name.1920x800_24Hz_P420.yuv)\n");
printf(" -psnr .................. compute PSNR (default)\n");
printf(" -ssim .................. compute SSIM\n");
printf(" -mse ................... compute MSE\n");
@ -146,7 +150,8 @@ void PrintHelp(const char * program) {
}
void ParseOptions(int argc, const char* argv[]) {
if (argc <= 1) PrintHelp(argv[0]);
if (argc <= 1)
PrintHelp(argv[0]);
for (int c = 1; c < argc; ++c) {
if (!strcmp(argv[c], "-v")) {
verbose = true;
@ -168,16 +173,16 @@ void ParseOptions(int argc, const char* argv[]) {
} else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
PrintHelp(argv[0]);
} else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
image_width = atoi(argv[++c]); // NOLINT
image_height = atoi(argv[++c]); // NOLINT
image_width = atoi(argv[++c]); // NOLINT
image_height = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-skip") && c + 2 < argc) {
num_skip_org = atoi(argv[++c]); // NOLINT
num_skip_rec = atoi(argv[++c]); // NOLINT
num_skip_org = atoi(argv[++c]); // NOLINT
num_skip_rec = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
num_frames = atoi(argv[++c]); // NOLINT
num_frames = atoi(argv[++c]); // NOLINT
#ifdef _OPENMP
} else if (!strcmp(argv[c], "-t") && c + 1 < argc) {
num_threads = atoi(argv[++c]); // NOLINT
num_threads = atoi(argv[++c]); // NOLINT
#endif
} else if (argv[c][0] == '-') {
fprintf(stderr, "Unknown option. %s\n", argv[c]);
@ -206,11 +211,9 @@ void ParseOptions(int argc, const char* argv[]) {
int org_width, org_height;
int rec_width, rec_height;
bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
&org_width,
&org_height);
&org_width, &org_height);
bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
&rec_width,
&rec_height);
&rec_width, &rec_height);
if (org_res_avail) {
if (rec_res_avail) {
if ((org_width == rec_width) && (org_height == rec_height)) {
@ -234,11 +237,15 @@ void ParseOptions(int argc, const char* argv[]) {
}
}
bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
const int y_size, const int uv_size, const size_t total_size,
bool UpdateMetrics(uint8* ch_org,
uint8* ch_rec,
const int y_size,
const int uv_size,
const size_t total_size,
int number_of_frames,
metric* cur_distortion_psnr,
metric* distorted_frame, bool do_psnr) {
metric* distorted_frame,
bool do_psnr) {
const int uv_offset = (do_swap_uv ? uv_size : 0);
const uint8* const u_org = ch_org + y_size + uv_offset;
const uint8* const u_rec = ch_rec + y_size;
@ -247,11 +254,11 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
if (do_psnr) {
#ifdef HAVE_JPEG
double y_err = static_cast<double>(
libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
double u_err = static_cast<double>(
libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
double v_err = static_cast<double>(
libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
#else
double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
@ -265,17 +272,17 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
distorted_frame->y = ComputePSNR(y_err, static_cast<double>(y_size));
distorted_frame->u = ComputePSNR(u_err, static_cast<double>(uv_size));
distorted_frame->v = ComputePSNR(v_err, static_cast<double>(uv_size));
distorted_frame->all = ComputePSNR(total_err,
static_cast<double>(total_size));
distorted_frame->all =
ComputePSNR(total_err, static_cast<double>(total_size));
} else {
distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2,
(image_height + 1) / 2);
distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2,
(image_height + 1) / 2);
distorted_frame->u =
CalcSSIM(u_org, u_rec, (image_width + 1) / 2, (image_height + 1) / 2);
distorted_frame->v =
CalcSSIM(v_org, v_rec, (image_width + 1) / 2, (image_height + 1) / 2);
distorted_frame->all =
(distorted_frame->y + distorted_frame->u + distorted_frame->v)
/ total_size;
(distorted_frame->y + distorted_frame->u + distorted_frame->v) /
total_size;
distorted_frame->y /= y_size;
distorted_frame->u /= uv_size;
distorted_frame->v /= uv_size;
@ -330,8 +337,8 @@ int main(int argc, const char* argv[]) {
}
// Open all files to compare to
FILE** file_rec = new FILE* [num_rec];
memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
FILE** file_rec = new FILE*[num_rec];
memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb");
if (file_rec[cur_rec] == NULL) {
@ -347,19 +354,18 @@ int main(int argc, const char* argv[]) {
const int y_size = image_width * image_height;
const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2);
const size_t total_size = y_size + 2 * uv_size; // NOLINT
const size_t total_size = y_size + 2 * uv_size; // NOLINT
#if defined(_MSC_VER)
_fseeki64(file_org,
static_cast<__int64>(num_skip_org) *
static_cast<__int64>(total_size), SEEK_SET);
_fseeki64(file_org, static_cast<__int64>(num_skip_org) *
static_cast<__int64>(total_size),
SEEK_SET);
#else
fseek(file_org, num_skip_org * total_size, SEEK_SET);
#endif
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
#if defined(_MSC_VER)
_fseeki64(file_rec[cur_rec],
static_cast<__int64>(num_skip_rec) *
static_cast<__int64>(total_size),
_fseeki64(file_rec[cur_rec], static_cast<__int64>(num_skip_rec) *
static_cast<__int64>(total_size),
SEEK_SET);
#else
fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET);
@ -420,7 +426,7 @@ int main(int argc, const char* argv[]) {
}
int number_of_frames;
for (number_of_frames = 0; ; ++number_of_frames) {
for (number_of_frames = 0;; ++number_of_frames) {
if (num_frames && number_of_frames >= num_frames)
break;
@ -432,17 +438,11 @@ int main(int argc, const char* argv[]) {
memcpy(ch_jpeg, ch_org, bytes_org);
memset(ch_org, 0, total_size);
if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org,
ch_org,
image_width,
ch_org + y_size,
(image_width + 1) / 2,
if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org, ch_org, image_width,
ch_org + y_size, (image_width + 1) / 2,
ch_org + y_size + uv_size,
(image_width + 1) / 2,
image_width,
image_height,
image_width,
image_height)) {
(image_width + 1) / 2, image_width,
image_height, image_width, image_height)) {
delete[] ch_jpeg;
break;
}
@ -453,8 +453,8 @@ int main(int argc, const char* argv[]) {
}
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
size_t bytes_rec = fread(ch_rec, sizeof(uint8),
total_size, file_rec[cur_rec]);
size_t bytes_rec =
fread(ch_rec, sizeof(uint8), total_size, file_rec[cur_rec]);
if (bytes_rec < total_size) {
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
@ -462,17 +462,11 @@ int main(int argc, const char* argv[]) {
memcpy(ch_jpeg, ch_rec, bytes_rec);
memset(ch_rec, 0, total_size);
if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec,
ch_rec,
image_width,
ch_rec + y_size,
(image_width + 1) / 2,
if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec, ch_rec, image_width,
ch_rec + y_size, (image_width + 1) / 2,
ch_rec + y_size + uv_size,
(image_width + 1) / 2,
image_width,
image_height,
image_width,
image_height)) {
(image_width + 1) / 2, image_width,
image_height, image_width, image_height)) {
delete[] ch_jpeg;
break;
}
@ -488,10 +482,8 @@ int main(int argc, const char* argv[]) {
if (do_psnr) {
metric distorted_frame;
metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
bool ismin = UpdateMetrics(ch_org, ch_rec,
y_size, uv_size, total_size,
number_of_frames,
cur_distortion_psnr,
bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size,
number_of_frames, cur_distortion_psnr,
&distorted_frame, true);
if (verbose) {
printf("\t%10.6f", distorted_frame.y);
@ -504,10 +496,8 @@ int main(int argc, const char* argv[]) {
if (do_ssim) {
metric distorted_frame;
metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
bool ismin = UpdateMetrics(ch_org, ch_rec,
y_size, uv_size, total_size,
number_of_frames,
cur_distortion_ssim,
bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size,
number_of_frames, cur_distortion_ssim,
&distorted_frame, false);
if (verbose) {
printf("\t%10.6f", distorted_frame.y);
@ -543,24 +533,20 @@ int main(int argc, const char* argv[]) {
}
if (do_psnr) {
const double global_psnr_y = ComputePSNR(
cur_distortion_psnr->global_y,
static_cast<double>(y_size) * number_of_frames);
const double global_psnr_u = ComputePSNR(
cur_distortion_psnr->global_u,
static_cast<double>(uv_size) * number_of_frames);
const double global_psnr_v = ComputePSNR(
cur_distortion_psnr->global_v,
static_cast<double>(uv_size) * number_of_frames);
const double global_psnr_all = ComputePSNR(
cur_distortion_psnr->global_all,
static_cast<double>(total_size) * number_of_frames);
printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
global_psnr_y,
global_psnr_u,
global_psnr_v,
global_psnr_all,
number_of_frames);
const double global_psnr_y =
ComputePSNR(cur_distortion_psnr->global_y,
static_cast<double>(y_size) * number_of_frames);
const double global_psnr_u =
ComputePSNR(cur_distortion_psnr->global_u,
static_cast<double>(uv_size) * number_of_frames);
const double global_psnr_v =
ComputePSNR(cur_distortion_psnr->global_v,
static_cast<double>(uv_size) * number_of_frames);
const double global_psnr_all =
ComputePSNR(cur_distortion_psnr->global_all,
static_cast<double>(total_size) * number_of_frames);
printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_psnr_y,
global_psnr_u, global_psnr_v, global_psnr_all, number_of_frames);
if (show_name) {
printf("\t%s", argv[fileindex_rec + cur_rec]);
}
@ -570,20 +556,14 @@ int main(int argc, const char* argv[]) {
if (!quiet) {
printf("Avg:");
if (do_psnr) {
printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
cur_distortion_psnr->y,
cur_distortion_psnr->u,
cur_distortion_psnr->v,
cur_distortion_psnr->all,
number_of_frames);
printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_psnr->y,
cur_distortion_psnr->u, cur_distortion_psnr->v,
cur_distortion_psnr->all, number_of_frames);
}
if (do_ssim) {
printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
cur_distortion_ssim->y,
cur_distortion_ssim->u,
cur_distortion_ssim->v,
cur_distortion_ssim->all,
number_of_frames);
printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_ssim->y,
cur_distortion_ssim->u, cur_distortion_ssim->v,
cur_distortion_ssim->all, number_of_frames);
}
if (show_name) {
printf("\t%s", argv[fileindex_rec + cur_rec]);
@ -594,19 +574,15 @@ int main(int argc, const char* argv[]) {
printf("Min:");
if (do_psnr) {
printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
cur_distortion_psnr->min_y,
cur_distortion_psnr->min_u,
cur_distortion_psnr->min_v,
cur_distortion_psnr->min_all,
cur_distortion_psnr->min_frame);
cur_distortion_psnr->min_y, cur_distortion_psnr->min_u,
cur_distortion_psnr->min_v, cur_distortion_psnr->min_all,
cur_distortion_psnr->min_frame);
}
if (do_ssim) {
printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
cur_distortion_ssim->min_y,
cur_distortion_ssim->min_u,
cur_distortion_ssim->min_v,
cur_distortion_ssim->min_all,
cur_distortion_ssim->min_frame);
cur_distortion_ssim->min_y, cur_distortion_ssim->min_u,
cur_distortion_ssim->min_v, cur_distortion_ssim->min_all,
cur_distortion_ssim->min_frame);
}
if (show_name) {
printf("\t%s", argv[fileindex_rec + cur_rec]);
@ -615,20 +591,20 @@ int main(int argc, const char* argv[]) {
}
if (do_mse) {
double global_mse_y = GetMSE(cur_distortion_psnr->global_y,
static_cast<double>(y_size) * number_of_frames);
double global_mse_u = GetMSE(cur_distortion_psnr->global_u,
static_cast<double>(uv_size) * number_of_frames);
double global_mse_v = GetMSE(cur_distortion_psnr->global_v,
static_cast<double>(uv_size) * number_of_frames);
double global_mse_all = GetMSE(cur_distortion_psnr->global_all,
static_cast<double>(total_size) * number_of_frames);
printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
global_mse_y,
global_mse_u,
global_mse_v,
global_mse_all,
number_of_frames);
double global_mse_y =
GetMSE(cur_distortion_psnr->global_y,
static_cast<double>(y_size) * number_of_frames);
double global_mse_u =
GetMSE(cur_distortion_psnr->global_u,
static_cast<double>(uv_size) * number_of_frames);
double global_mse_v =
GetMSE(cur_distortion_psnr->global_v,
static_cast<double>(uv_size) * number_of_frames);
double global_mse_all =
GetMSE(cur_distortion_psnr->global_all,
static_cast<double>(total_size) * number_of_frames);
printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_mse_y,
global_mse_u, global_mse_v, global_mse_all, number_of_frames);
if (show_name) {
printf("\t%s", argv[fileindex_rec + cur_rec]);
}

View File

@ -16,11 +16,11 @@
extern "C" {
#endif
typedef unsigned int uint32; // NOLINT
typedef unsigned short uint16; // NOLINT
typedef unsigned int uint32; // NOLINT
typedef unsigned short uint16; // NOLINT
#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
(defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
(defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
@ -38,22 +38,29 @@ enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i)
1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i)
};
static const double kiW[KERNEL + 1 + 1] = {
1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j]
1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j]
1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j]
1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j]
1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j]
1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j]
};
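A quick check of the kiW denominators above: sum(K) = 1 + 3 + 7 + 11 + 7 + 3 + 1 = 33, so the full separable kernel weighs 33 * 33, and each border entry drops trailing K columns from one factor:

// 33 * 33                = 1089
// 33 * (33 - 1)          = 1056   (column K[6] = 1 dropped)
// 33 * (33 - 1 - 3)      = 957    (K[5] = 3 also dropped)
// 33 * (33 - 1 - 3 - 7)  = 726    (K[4] = 7 also dropped)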
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product
#define MAKE_WEIGHT(L) \
{ { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }
#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product
#define MAKE_WEIGHT(L) \
{ \
{ \
{ \
PWEIGHT(L, 0) \
, PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \
PWEIGHT(L, 5), PWEIGHT(L, 6), 0 \
} \
} \
}
// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
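A minimal self-contained sketch of that union trick (illustrative only; the real tables are produced by MAKE_WEIGHT above):

#include <emmintrin.h>
typedef unsigned short uint16;  // NOLINT
// Initializing through the uint16 array gives a compile-time constant that
// can still be read as an __m128i; _mm_set_epi16() cannot appear in a
// static initializer.
static const union {
  uint16 i16_[8];
  __m128i m_;
} kWeightOne = {{1, 1, 1, 1, 1, 1, 1, 0}};
// Usage: _mm_mullo_epi16(pixels, kWeightOne.m_);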
@ -62,32 +69,36 @@ static const struct {
uint16 i16_[8];
__m128i m_;
} values_;
} W0 = MAKE_WEIGHT(0),
W1 = MAKE_WEIGHT(1),
W2 = MAKE_WEIGHT(2),
} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
W3 = MAKE_WEIGHT(3);
// ... the rest is symmetric.
// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif
// Common final expression for SSIM, once the weighted sums are known.
static double FinalizeSSIM(double iw, double xm, double ym,
double xxm, double xym, double yym) {
static double FinalizeSSIM(double iw,
double xm,
double ym,
double xxm,
double xym,
double yym) {
const double iwx = xm * iw;
const double iwy = ym * iw;
double sxx = xxm * iw - iwx * iwx;
double syy = yym * iw - iwy * iwy;
// small errors are possible, due to rounding. Clamp to zero.
if (sxx < 0.) sxx = 0.;
if (syy < 0.) syy = 0.;
if (sxx < 0.)
sxx = 0.;
if (syy < 0.)
syy = 0.;
const double sxsy = sqrt(sxx * syy);
const double sxy = xym * iw - iwx * iwy;
static const double C11 = (0.01 * 0.01) * (255 * 255);
static const double C22 = (0.03 * 0.03) * (255 * 255);
static const double C33 = (0.015 * 0.015) * (255 * 255);
const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
const double c = (2. * sxsy + C22) / (sxx + syy + C22);
const double c = (2. * sxsy + C22) / (sxx + syy + C22);
const double s = (sxy + C33) / (sxsy + C33);
return l * c * s;
}
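FinalizeSSIM is the standard three-factor SSIM decomposition; with iwx/iwy as the weighted means and sxx/syy/sxy as the weighted (co)variances, the code computes

l = \frac{2\mu_x\mu_y + C_1}{\mu_x^2 + \mu_y^2 + C_1}, \quad
c = \frac{2\sigma_x\sigma_y + C_2}{\sigma_x^2 + \sigma_y^2 + C_2}, \quad
s = \frac{\sigma_{xy} + C_3}{\sigma_x\sigma_y + C_3}, \qquad
\mathrm{SSIM} = l \cdot c \cdot s

with C_1, C_2, C_3 being the C11, C22, C33 constants defined above.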
@ -98,15 +109,21 @@ static double FinalizeSSIM(double iw, double xm, double ym,
// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// with a diff of 255, squared. The maximum error is thus 0x4388241,
// which fits into 32-bit integers.
double GetSSIM(const uint8 *org, const uint8 *rec,
int xo, int yo, int W, int H, int stride) {
double GetSSIM(const uint8* org,
const uint8* rec,
int xo,
int yo,
int W,
int H,
int stride) {
uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
org += (yo - KERNEL) * stride;
org += (xo - KERNEL);
rec += (yo - KERNEL) * stride;
rec += (xo - KERNEL);
for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H))
continue;
const int Wy = K[y_];
for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
const int Wxy = Wy * K[x_];
@ -114,8 +131,8 @@ double GetSSIM(const uint8 *org, const uint8 *rec,
const int org_x = org[x_];
const int rec_x = rec[x_];
ws += Wxy;
xm += Wxy * org_x;
ym += Wxy * rec_x;
xm += Wxy * org_x;
ym += Wxy * rec_x;
xxm += Wxy * org_x * org_x;
xym += Wxy * org_x * rec_x;
yym += Wxy * rec_x * rec_x;
@ -125,8 +142,11 @@ double GetSSIM(const uint8 *org, const uint8 *rec,
return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}
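The 32-bit claim in the note above checks out: each axis of the kernel sums to 11 + 2 * (7 + 3 + 1) = 33, so the largest possible accumulator value is

33^2 * 255^2 = 1089 * 65025 = 70812225 = 0x4388241 < 2^31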
double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
int xo, int yo, int stride,
double GetSSIMFullKernel(const uint8* org,
const uint8* rec,
int xo,
int yo,
int stride,
double area_weight) {
uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
@ -161,8 +181,8 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
const int ll2 = rec[dy2 - x];
const int lr2 = rec[dy2 + x];
xm += Wxy * (ul1 + ur1 + ll1 + lr1);
ym += Wxy * (ul2 + ur2 + ll2 + lr2);
xm += Wxy * (ul1 + ur1 + ll1 + lr1);
ym += Wxy * (ul2 + ur2 + ll2 + lr2);
xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
@ -189,8 +209,8 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
const int l2 = rec[-y];
const int r2 = rec[y];
xm += Wxy * (u1 + d1 + l1 + r1);
ym += Wxy * (u2 + d2 + l2 + r2);
xm += Wxy * (u1 + d1 + l1 + r1);
ym += Wxy * (u2 + d2 + l2 + r2);
xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
@ -201,13 +221,13 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
const int s1 = org[0];
const int s2 = rec[0];
xm += Wxy * s1;
ym += Wxy * s2;
xm += Wxy * s1;
ym += Wxy * s2;
xxm += Wxy * s1 * s1;
xym += Wxy * s1 * s2;
yym += Wxy * s2 * s2;
#else // __SSE2__
#else // __SSE2__
org += (yo - KERNEL) * stride + (xo - KERNEL);
rec += (yo - KERNEL) * stride + (xo - KERNEL);
@ -221,29 +241,31 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
// Read 8 pixels at line #L, convert to 16-bit, perform the weighting,
// and accumulate.
#define LOAD_LINE_PAIR(L, WEIGHT) do { \
const __m128i v0 = \
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
const __m128i v1 = \
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
} while (0)
#define LOAD_LINE_PAIR(L, WEIGHT) \
do { \
const __m128i v0 = \
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
const __m128i v1 = \
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
} while (0)
#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \
uint32 tmp[4]; \
_mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
(OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
} while (0)
#define ADD_AND_STORE_FOUR_EPI32(M, OUT) \
do { \
uint32 tmp[4]; \
_mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
(OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
} while (0)
LOAD_LINE_PAIR(0, W0);
LOAD_LINE_PAIR(1, W1);
@ -266,10 +288,14 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}
static int start_max(int x, int y) { return (x > y) ? x : y; }
static int start_max(int x, int y) {
return (x > y) ? x : y;
}
double CalcSSIM(const uint8 *org, const uint8 *rec,
const int image_width, const int image_height) {
double CalcSSIM(const uint8* org,
const uint8* rec,
const int image_width,
const int image_height) {
double SSIM = 0.;
const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
@ -284,7 +310,7 @@ double CalcSSIM(const uint8 *org, const uint8 *rec,
}
#ifdef _OPENMP
#pragma omp parallel for reduction(+: SSIM)
#pragma omp parallel for reduction(+ : SSIM)
#endif
for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
for (int i = 0; i < KERNEL_X; ++i) {
@ -302,8 +328,8 @@ double CalcSSIM(const uint8 *org, const uint8 *rec,
// NOTE: we could use a similar method for the left-most pixels too.
const int kScratchWidth = 8;
const int kScratchStride = kScratchWidth + KERNEL + 1;
uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };
uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0};
uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0};
for (int k = 0; k < KERNEL_SIZE; ++k) {
const int offset =
@ -311,9 +337,9 @@ double CalcSSIM(const uint8 *org, const uint8 *rec,
memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
}
for (int k = 0; k <= KERNEL_X + 1; ++k) {
SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
KERNEL + k, KERNEL, kScratchStride, kiW[k]);
for (int k = 0; k <= KERNEL_X + 1; ++k) {
SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
kScratchStride, kiW[k]);
}
}
}
@ -333,4 +359,3 @@ double CalcLSSIM(double ssim) {
#ifdef __cplusplus
} // extern "C"
#endif

View File

@ -24,8 +24,10 @@ typedef unsigned char uint8;
#define UINT8_TYPE_DEFINED
#endif
double CalcSSIM(const uint8* org, const uint8* rec,
const int image_width, const int image_height);
double CalcSSIM(const uint8* org,
const uint8* rec,
const int image_width,
const int image_height);
double CalcLSSIM(double ssim);
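A hedged usage sketch of the two entry points declared above (uint8 and the declarations come from this header; the plane names are hypothetical):

#include <stdio.h>
// org_y and rec_y are same-sized Y planes supplied by the caller.
double ReportSSIM(const uint8* org_y, const uint8* rec_y,
                  int width, int height) {
  double ssim = CalcSSIM(org_y, rec_y, width, height);
  printf("SSIM %.6f  LSSIM %.6f\n", ssim, CalcLSSIM(ssim));
  return ssim;
}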