From aed1cc94c105736a5e6010e9c84cc32910b865d6 Mon Sep 17 00:00:00 2001 From: "mikhal@webrtc.org" Date: Wed, 28 Sep 2011 00:06:25 +0000 Subject: [PATCH] first draft git-svn-id: http://libyuv.googlecode.com/svn/trunk@2 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- common/basic_types.h | 108 ++ common/common.h | 121 ++ common/constructor_magic.h | 39 + include/convert.h | 169 +++ include/format_conversion.h | 36 + include/general.h | 61 + include/planar_functions.h | 61 + include/scale.h | 56 + libyuv.gyp | 56 + source/conversion_tables.h | 203 +++ source/convert.cc | 1467 ++++++++++++++++++ source/cpu_id.cc | 116 ++ source/cpu_id.h | 53 + source/format_conversion.cc | 488 ++++++ source/general.cc | 338 +++++ source/linux.cc | 350 +++++ source/linux.h | 118 ++ source/planar_functions.cc | 341 +++++ source/row.h | 73 + source/row_posix.cc | 163 ++ source/row_table.cc | 245 +++ source/scale.cc | 2848 +++++++++++++++++++++++++++++++++++ source/video_common.cc | 64 + source/video_common.h | 174 +++ 24 files changed, 7748 insertions(+) create mode 100644 common/basic_types.h create mode 100644 common/common.h create mode 100644 common/constructor_magic.h create mode 100644 include/convert.h create mode 100644 include/format_conversion.h create mode 100644 include/general.h create mode 100644 include/planar_functions.h create mode 100644 include/scale.h create mode 100644 libyuv.gyp create mode 100644 source/conversion_tables.h create mode 100644 source/convert.cc create mode 100644 source/cpu_id.cc create mode 100644 source/cpu_id.h create mode 100644 source/format_conversion.cc create mode 100644 source/general.cc create mode 100644 source/linux.cc create mode 100644 source/linux.h create mode 100644 source/planar_functions.cc create mode 100644 source/row.h create mode 100644 source/row_posix.cc create mode 100644 source/row_table.cc create mode 100644 source/scale.cc create mode 100644 source/video_common.cc create mode 100644 source/video_common.h diff --git a/common/basic_types.h b/common/basic_types.h new file mode 100644 index 000000000..a5e921847 --- /dev/null +++ b/common/basic_types.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef LIBYUV_COMMON_BASIC_TYPES_H_
+#define LIBYUV_COMMON_BASIC_TYPES_H_
+
+#include <stddef.h>  // for NULL, size_t
+
+#ifndef WIN32
+#include <stdint.h>  // for uintptr_t
+#endif
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "constructor_magic.h"
+
+
+#ifndef INT_TYPES_DEFINED
+#define INT_TYPES_DEFINED
+#ifdef COMPILER_MSVC
+typedef __int64 int64;
+#else
+typedef long long int64;
+#endif /* COMPILER_MSVC */
+typedef int int32;
+typedef short int16;
+typedef char int8;
+
+#ifdef COMPILER_MSVC
+typedef unsigned __int64 uint64;
+typedef __int64 int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## I64
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UI64
+#endif
+#define INT64_F "I64"
+#else
+typedef unsigned long long uint64;
+typedef long long int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## LL
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## ULL
+#endif
+#define INT64_F "ll"
+#endif /* COMPILER_MSVC */
+typedef unsigned int uint32;
+typedef unsigned short uint16;
+typedef unsigned char uint8;
+#endif  // INT_TYPES_DEFINED
+
+#ifdef WIN32
+typedef int socklen_t;
+#endif
+
+namespace libyuv {
+  template<class T> inline T _min(T a, T b) { return (a > b) ? b : a; }
+  template<class T> inline T _max(T a, T b) { return (a < b) ? b : a; }
+
+  // For wait functions that take a number of milliseconds, kForever indicates
+  // unlimited time.
+  const int kForever = -1;
+}
+
+// Detect whether the compiler targets x86 or x64.
+#if defined(__x86_64__) || defined(_M_X64) || \
+    defined(__i386__) || defined(_M_IX86)
+#define CPU_X86 1
+#endif
+
+#ifdef WIN32
+#define alignof(t) __alignof(t)
+#else  // !WIN32
+#define alignof(t) __alignof__(t)
+#endif  // !WIN32
+#define IS_ALIGNED(p, a) (0==(reinterpret_cast<uintptr_t>(p) & ((a)-1)))
+#define ALIGNP(p, t) \
+  (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
+  ((t)-1)) & ~((t)-1))))
+
+#ifndef UNUSED
+#define UNUSED(x) Unused(static_cast<const void*>(&x))
+#define UNUSED2(x,y) Unused(static_cast<const void*>(&x)); Unused(static_cast<const void*>(&y))
+#define UNUSED3(x,y,z) Unused(static_cast<const void*>(&x)); Unused(static_cast<const void*>(&y)); Unused(static_cast<const void*>(&z))
+#define UNUSED4(x,y,z,a) Unused(static_cast<const void*>(&x)); Unused(static_cast<const void*>(&y)); Unused(static_cast<const void*>(&z)); Unused(static_cast<const void*>(&a))
+#define UNUSED5(x,y,z,a,b) Unused(static_cast<const void*>(&x)); Unused(static_cast<const void*>(&y)); Unused(static_cast<const void*>(&z)); Unused(static_cast<const void*>(&a)); Unused(static_cast<const void*>(&b))
+inline void Unused(const void *) { }
+#endif  // UNUSED
+
+#if defined(__GNUC__)
+#define GCC_ATTR(x) __attribute__ ((x))
+#else  // !__GNUC__
+#define GCC_ATTR(x)
+#endif  // !__GNUC__
+
+#endif  // LIBYUV_COMMON_BASIC_TYPES_H_
diff --git a/common/common.h b/common/common.h
new file mode 100644
index 000000000..a4163de15
--- /dev/null
+++ b/common/common.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBYUV_SOURCE_COMMON_H_
+#define LIBYUV_SOURCE_COMMON_H_
+
+#include "constructor_magic.h"
+
+#if defined(_MSC_VER)
+// warning C4355: 'this' : used in base member initializer list
+#pragma warning(disable:4355)
+#endif
+
+//////////////////////////////////////////////////////////////////////
+// General Utilities
+//////////////////////////////////////////////////////////////////////
+
+#ifndef UNUSED
+#define UNUSED(x) Unused(static_cast<const void*>(&x))
+#define UNUSED2(x,y) Unused(static_cast<const void*>(&x)); Unused(static_cast<const void*>(&y))
+#define UNUSED3(x,y,z) Unused(static_cast<const void*>(&x)); Unused(static_cast<const void*>(&y)); Unused(static_cast<const void*>(&z))
+#define UNUSED4(x,y,z,a) Unused(static_cast<const void*>(&x)); Unused(static_cast<const void*>(&y)); Unused(static_cast<const void*>(&z)); Unused(static_cast<const void*>(&a))
+#define UNUSED5(x,y,z,a,b) Unused(static_cast<const void*>(&x)); Unused(static_cast<const void*>(&y)); Unused(static_cast<const void*>(&z)); Unused(static_cast<const void*>(&a)); Unused(static_cast<const void*>(&b))
+inline void Unused(const void *) { }
+#endif  // UNUSED
+
+#ifndef WIN32
+#define strnicmp(x,y,n) strncasecmp(x,y,n)
+#define stricmp(x,y) strcasecmp(x,y)
+
+// TODO(sergeyu): Remove this. std::max should be used everywhere in the code.
+// NOMINMAX must be defined where we include <windows.h>.
+#define stdmax(x,y) std::max(x,y)
+#else
+#define stdmax(x,y) libyuv::_max(x,y)
+#endif
+
+
+#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0]))))
+
+/////////////////////////////////////////////////////////////////////////////
+// Assertions
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef ENABLE_DEBUG
+#define ENABLE_DEBUG _DEBUG
+#endif  // !defined(ENABLE_DEBUG)
+
+#if ENABLE_DEBUG
+
+namespace libyuv {
+
+// Break causes the debugger to stop executing, or the program to abort.
+void Break();
+
+// LogAssert writes information about an assertion to the log.
+void LogAssert(const char * function, const char * file, int line,
+               const char * expression);
+
+inline bool Assert(bool result, const char * function, const char * file,
+                   int line, const char * expression) {
+  if (!result) {
+    LogAssert(function, file, line, expression);
+    Break();
+    return false;
+  }
+  return true;
+}
+
+}  // namespace libyuv
+
+#if defined(_MSC_VER) && _MSC_VER < 1300
+#define __FUNCTION__ ""
+#endif
+
+#ifndef ASSERT
+#define ASSERT(x) (void)libyuv::Assert((x),__FUNCTION__,__FILE__,__LINE__,#x)
+#endif
+
+#ifndef VERIFY
+#define VERIFY(x) libyuv::Assert((x),__FUNCTION__,__FILE__,__LINE__,#x)
+#endif
+
+#else  // !ENABLE_DEBUG
+
+namespace libyuv {
+
+inline bool ImplicitCastToBool(bool result) { return result; }
+
+}  // namespace libyuv
+
+#ifndef ASSERT
+#define ASSERT(x) (void)0
+#endif
+
+#ifndef VERIFY
+#define VERIFY(x) libyuv::ImplicitCastToBool(x)
+#endif
+
+#endif  // !ENABLE_DEBUG
+
+#define COMPILE_TIME_ASSERT(expr) char CTA_UNIQUE_NAME[expr]
+#define CTA_UNIQUE_NAME CTA_MAKE_NAME(__LINE__)
+#define CTA_MAKE_NAME(line) CTA_MAKE_NAME2(line)
+#define CTA_MAKE_NAME2(line) constraint_ ## line
+
+#ifdef __GNUC__
+// Forces compiler to inline, even against its better judgement. Use wisely.
+#define FORCE_INLINE __attribute__((always_inline))
+#else
+#define FORCE_INLINE
+#endif
+
+#endif  // LIBYUV_SOURCE_COMMON_H_
diff --git a/common/constructor_magic.h b/common/constructor_magic.h
new file mode 100644
index 000000000..2a5dd1267
--- /dev/null
+++ b/common/constructor_magic.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef LIBYUV_COMMON_CONSTRUCTOR_MAGIC_H_ +#define LIBYUV_COMMON_CONSTRUCTOR_MAGIC_H_ + +#define DISALLOW_ASSIGN(TypeName) \ + void operator=(const TypeName&) + +// A macro to disallow the evil copy constructor and operator= functions +// This should be used in the private: declarations for a class +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + DISALLOW_ASSIGN(TypeName) + +// Alternative, less-accurate legacy name. +#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \ + DISALLOW_COPY_AND_ASSIGN(TypeName) + +// A macro to disallow all the implicit constructors, namely the +// default constructor, copy constructor and operator= functions. +// +// This should be used in the private: declarations for a class +// that wants to prevent anyone from instantiating it. This is +// especially useful for classes containing only static methods. +#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ + TypeName(); \ + DISALLOW_EVIL_CONSTRUCTORS(TypeName) + + +#endif // LIBYUV_COMMON_CONSTRUCTOR_MAGIC_H_ diff --git a/include/convert.h b/include/convert.h new file mode 100644 index 000000000..5952a4484 --- /dev/null +++ b/include/convert.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef LIBYUV_INCLUDE_CONVERT_H_ +#define LIBYUV_INCLUDE_CONVERT_H_ + +#include "basic_types.h" + +namespace libyuv +{ + +int +ConvertI420ToRGB24(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +int +ConvertI420ToARGB(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +int +ConvertI420ToARGB4444(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +int +ConvertI420ToRGB565(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +int +ConvertI420ToRGB565Android(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +int +ConvertI420ToARGB1555(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +int +ConvertYV12ToARGB(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + + +int +ConvertYV12ToRGBA(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +int +ConvertI420ToYUY2(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); +int +ConvertUYVYToI420(const uint8* src_frame, int src_stride, + uint8* dst_yplane, int dst_ystride, + uint8* dst_uplane, int dst_ustride, + uint8* dst_vplane, int dst_vstride, + int src_width, + int src_height); + +int +ConvertI420ToYUY2(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); +int +ConvertI420ToYV12(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +int +ConvertRGB24ToARGB(const uint8* src_frame, int src_stride, + uint8* dst_frame, int dst_stride, + int src_width, int src_height + ); + +int +ConvertRGB24ToI420(const uint8* src_frame, int src_stride, + uint8* dst_yplane, int dst_ystride, + uint8* dst_uplane, int dst_ustride, + uint8* dst_vplane, int dst_vstride, + int src_width, + int src_height + ); + +int +ConvertABGRToI420(const uint8* src_frame, int src_stride, + uint8* dst_yplane, int dst_ystride, + uint8* dst_uplane, int dst_ustride, + uint8* dst_vplane, int dst_vstride, + int src_width, + int src_height + ); + +int +ConvertNv12ToRGB565(const uint8* src_yplane, int src_ystride, 
+ const uint8* src_uvplane, int src_uvstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +int +ConvertI420ToABGR(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ); + +} // namespace libyuv + +#endif // LIBYUV_INCLUDE_CONVERT_H_ diff --git a/include/format_conversion.h b/include/format_conversion.h new file mode 100644 index 000000000..e93532046 --- /dev/null +++ b/include/format_conversion.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef LIBYUV_INCLUDE_FORMAT_CONVERSION_H_ +#define LIBYUV_INCLUDE_FORMAT_CONVERSION_H_ + +#include "basic_types.h" + +namespace libyuv { + +// Converts any Bayer RGB format to I420. +void BayerRGBToI420(const uint8* src_bayer, int src_pitch_bayer, + uint32 src_fourcc_bayer, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height); + +// Converts any 32 bit ARGB to any Bayer RGB format. +void RGB32ToBayerRGB(const uint8* src_rgb, int src_pitch_rgb, + uint32 src_fourcc_rgb, + uint8* dst_bayer, int dst_pitch_bayer, + uint32 dst_fourcc_bayer, + int width, int height); + + +} // namespace libyuv + +#endif // LIBYUV_INCLUDE_FORMAT_CONVERSION_H_ diff --git a/include/general.h b/include/general.h new file mode 100644 index 000000000..b7227bf9e --- /dev/null +++ b/include/general.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * General operations on YUV images. + */ + +#ifndef LIBYUV_INCLUDE_GENERAL_H_ +#define LIBYUV_INCLUDE_GENERAL_H_ + +#include "basic_types.h" + +namespace libyuv { + +// Supported rotation +enum VideoRotationMode +{ + kRotateNone = 0, + kRotateClockwise = 90, + kRotateAntiClockwise = -90, + kRotate180 = 180, +}; + +// Mirror functions +// The following 2 functions perform mirroring on an image (LeftRight/UpDown) +// Input: +// - width : Image width in pixels. +// - height : Image height in pixels. +// - inFrame : Reference to input image. +// - outFrame : Reference to converted image. +// Return value: 0 if OK, < 0 otherwise. +int +MirrorI420LeftRight(const uint8* src_frame, int src_stride, + uint8* dst_frame, int dst_stride, + int src_width, int src_height); + +// Cut/Pad I420 frame to match desird dimensions. 
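+// Input:
+//    - inFrame             : Reference to input I420 frame.
+//    - inWidth, inHeight   : Dimensions of the input frame, in pixels.
+//    - outFrame            : Reference to output buffer, assumed to hold at
+//                            least (outWidth * outHeight * 3) / 2 bytes.
+//    - outWidth, outHeight : Desired output dimensions, in pixels.
+// Return value: 0 if OK, < 0 otherwise.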
+int +CutPadI420Frame(const uint8* inFrame, int inWidth, + int inHeight, uint8* outFrame, + int outWidth, int outHeight); + +// I420 Cut - make a center cut +int +CutI420Frame(uint8* frame, int fromWidth, + int fromHeight, int toWidth, + int toHeight); + + +} // namespace libyuv + + +#endif // LIBYUV_INCLUDE_GENERAL_H_ diff --git a/include/planar_functions.h b/include/planar_functions.h new file mode 100644 index 000000000..469b31d37 --- /dev/null +++ b/include/planar_functions.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_ +#define LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_ + +#include "basic_types.h" + +namespace libyuv { + +class PlanarFunctions { + public: + + // Copy I420 to I420. + static void I420Copy(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height); + + // Convert I422 to I420. Used by MJPG. + static void I422ToI420(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height); + + // Convert M420 to I420. + static void M420ToI420(uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + const uint8* m420, int pitch_m420, + int width, int height); + + // Convert NV12 to I420. Also used for NV21. + static void NV12ToI420(uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + const uint8* src_y, + const uint8* src_uv, + int src_pitch, + int width, int height); + + DISALLOW_IMPLICIT_CONSTRUCTORS(PlanarFunctions); +}; + +} // namespace libyuv + +#endif // LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_ diff --git a/include/scale.h b/include/scale.h new file mode 100644 index 000000000..c795ff731 --- /dev/null +++ b/include/scale.h @@ -0,0 +1,56 @@ + + + +#ifndef LIBYUV_INCLUDE_SCALE_H_ +#define LIBYUV_INCLUDE_SCALE_H_ + + +#include "basic_types.h" + +#if defined(_MSC_VER) +#define ALIGN16(var) __declspec(align(16)) var +#else +#define ALIGN16(var) var __attribute__((aligned(16))) +#endif + + +namespace libyuv +{ + +class YuvScaler { + public: + // Scales a YUV 4:2:0 image from the input width and height to the + // output width and height. If outh_offset is nonzero, the image is + // offset by that many pixels and stretched to (outh - outh_offset * 2) + // pixels high, instead of outh. + // If interpolate is not set, a simple nearest-neighbor algorithm is + // used. This produces basic (blocky) quality at the fastest speed. + // If interpolate is set, interpolation is used to produce a better + // quality image, at the expense of speed. + // Returns true if successful. + static bool Scale(const uint8 *in, int32 inw, int32 inh, + uint8 *out, int32 outw, int32 outh, int32 outh_offset, + bool interpolate); + + // Same, but specified in terms of each plane location and stride. 
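+  // Strides are in bytes and may exceed the plane width. For I420 input the
+  // U and V planes are a quarter the size of the Y plane (half width, half
+  // height), so istrideU and istrideV are typically istrideY / 2.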
+ static bool Scale(const uint8 *inY, const uint8 *inU, const uint8 *inV, + int32 istrideY, int32 istrideU, int32 istrideV, + int32 iwidth, int32 iheight, + uint8 *outY, uint8 *outU, uint8 *outV, + int32 ostrideY, int32 ostrideU, int32 ostrideV, + int32 owidth, int32 oheight, + bool interpolate); + + // For testing, allow disabling of optimizations. + static void SetUseReferenceImpl(bool use) { use_reference_impl_ = use; } + + private: + + static bool use_reference_impl_; + + DISALLOW_IMPLICIT_CONSTRUCTORS(YuvScaler); +}; + +} // namespace libyuv + +#endif // LIBYUV_INCLUDE_SCALE_H_ diff --git a/libyuv.gyp b/libyuv.gyp new file mode 100644 index 000000000..4bee6d1eb --- /dev/null +++ b/libyuv.gyp @@ -0,0 +1,56 @@ +# Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +{ + 'targets': [ + { + 'target_name': 'libyuv', + 'type': 'static_library', + 'dependencies': [ + ], + 'include_dirs': [ + 'include', + 'common', + + ], + 'sources': [ + # includes + 'include/convert.h', + 'include/general.h', + 'include/scale.h', + 'include/planar_functions.h', + + # headers + 'common/basic_types.h', + 'common/common.h', + 'common/constructor_magic.h', + 'source/cpu_id.h', + 'source/row.h', + 'source/video_common.h', + + # sources + 'source/convert.cc', + 'source/general.cc', + 'source/scale.cc', + 'source/cpu_id.cc', + 'source/format_conversion.cc', + 'source/planar_functions.cc', + 'source/row_posix.cc', + 'source/row_table.cc', + 'source/video_common.cc', + + ], + }, + ], # targets +} + +# Local Variables: +# tab-width:2 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/source/conversion_tables.h b/source/conversion_tables.h new file mode 100644 index 000000000..e778fa2d0 --- /dev/null +++ b/source/conversion_tables.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/************************************************************** +* conversion_tables.h +* +* Pre-compiled definitions of the conversion equations: YUV -> RGB. 
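+*
+* The tables below hold these coefficients pre-multiplied by 256 (the BT.601
+* factors 1.164*256 ~= 298, 1.596*256 ~= 409, 0.391*256 ~= 100,
+* 0.813*256 ~= 208, 2.018*256 ~= 516), so the per-pixel math reduces to table
+* lookups, adds and one final shift. Worked example for a mid-gray pixel
+* (Y=128, U=V=128):
+* R = (298 * (128 - 16) + 409 * (128 - 128) + 128) >> 8 = 33504 >> 8 = 130.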
+*
+***************************************************************/
+
+#ifndef WEBRTC_COMMON_VIDEO_VPLIB_CONVERSION_TABLES
+#define WEBRTC_COMMON_VIDEO_VPLIB_CONVERSION_TABLES
+
+namespace libyuv
+{
+/******************************************************************************
+* YUV TO RGB approximation
+*
+* R = clip( (298 * (Y - 16) + 409 * (V - 128) + 128 ) >> 8 )
+* G = clip( (298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128 ) >> 8 )
+* B = clip( (298 * (Y - 16) + 516 * (U - 128) + 128 ) >> 8 )
+*******************************************************************************/
+
+    #define Yc(i)  static_cast<int>( 298 * ( i - 16 ))   // Y contribution
+    #define Ucg(i) static_cast<int>( -100 * ( i - 128 )) // U contribution to G
+    #define Ucb(i) static_cast<int>( 516 * ( i - 128 ))  // U contribution to B
+    #define Vcr(i) static_cast<int>( 409 * ( i - 128 ))  // V contribution to R
+    #define Vcg(i) static_cast<int>( -208 * ( i - 128 )) // V contribution to G
+
+    static const int mapYc[256] = {
+    Yc(0),Yc(1),Yc(2),Yc(3),Yc(4),Yc(5),Yc(6),Yc(7),Yc(8),Yc(9),
+    Yc(10),Yc(11),Yc(12),Yc(13),Yc(14),Yc(15),Yc(16),Yc(17),Yc(18),Yc(19),
+    Yc(20),Yc(21),Yc(22),Yc(23),Yc(24),Yc(25),Yc(26),Yc(27),Yc(28),Yc(29),
+    Yc(30),Yc(31),Yc(32),Yc(33),Yc(34),Yc(35),Yc(36),Yc(37),Yc(38),Yc(39),
+    Yc(40),Yc(41),Yc(42),Yc(43),Yc(44),Yc(45),Yc(46),Yc(47),Yc(48),Yc(49),
+    Yc(50),Yc(51),Yc(52),Yc(53),Yc(54),Yc(55),Yc(56),Yc(57),Yc(58),Yc(59),
+    Yc(60),Yc(61),Yc(62),Yc(63),Yc(64),Yc(65),Yc(66),Yc(67),Yc(68),Yc(69),
+    Yc(70),Yc(71),Yc(72),Yc(73),Yc(74),Yc(75),Yc(76),Yc(77),Yc(78),Yc(79),
+    Yc(80),Yc(81),Yc(82),Yc(83),Yc(84),Yc(85),Yc(86),Yc(87),Yc(88),Yc(89),
+    Yc(90),Yc(91),Yc(92),Yc(93),Yc(94),Yc(95),Yc(96),Yc(97),Yc(98),Yc(99),
+    Yc(100),Yc(101),Yc(102),Yc(103),Yc(104),Yc(105),Yc(106),Yc(107),Yc(108),
+    Yc(109),Yc(110),Yc(111),Yc(112),Yc(113),Yc(114),Yc(115),Yc(116),Yc(117),
+    Yc(118),Yc(119),Yc(120),Yc(121),Yc(122),Yc(123),Yc(124),Yc(125),Yc(126),
+    Yc(127),Yc(128),Yc(129),Yc(130),Yc(131),Yc(132),Yc(133),Yc(134),Yc(135),
+    Yc(136),Yc(137),Yc(138),Yc(139),Yc(140),Yc(141),Yc(142),Yc(143),Yc(144),
+    Yc(145),Yc(146),Yc(147),Yc(148),Yc(149),Yc(150),Yc(151),Yc(152),Yc(153),
+    Yc(154),Yc(155),Yc(156),Yc(157),Yc(158),Yc(159),Yc(160),Yc(161),Yc(162),
+    Yc(163),Yc(164),Yc(165),Yc(166),Yc(167),Yc(168),Yc(169),Yc(170),Yc(171),
+    Yc(172),Yc(173),Yc(174),Yc(175),Yc(176),Yc(177),Yc(178),Yc(179),Yc(180),
+    Yc(181),Yc(182),Yc(183),Yc(184),Yc(185),Yc(186),Yc(187),Yc(188),Yc(189),
+    Yc(190),Yc(191),Yc(192),Yc(193),Yc(194),Yc(195),Yc(196),Yc(197),Yc(198),
+    Yc(199),Yc(200),Yc(201),Yc(202),Yc(203),Yc(204),Yc(205),Yc(206),Yc(207),
+    Yc(208),Yc(209),Yc(210),Yc(211),Yc(212),Yc(213),Yc(214),Yc(215),Yc(216),
+    Yc(217),Yc(218),Yc(219),Yc(220),Yc(221),Yc(222),Yc(223),Yc(224),Yc(225),
+    Yc(226),Yc(227),Yc(228),Yc(229),Yc(230),Yc(231),Yc(232),Yc(233),Yc(234),
+    Yc(235),Yc(236),Yc(237),Yc(238),Yc(239),Yc(240),Yc(241),Yc(242),Yc(243),
+    Yc(244),Yc(245),Yc(246),Yc(247),Yc(248),Yc(249),Yc(250),Yc(251),Yc(252),
+    Yc(253),Yc(254),Yc(255)};
+
+    static const int mapUcg[256] = {
+    Ucg(0),Ucg(1),Ucg(2),Ucg(3),Ucg(4),Ucg(5),Ucg(6),Ucg(7),Ucg(8),Ucg(9),
+    Ucg(10),Ucg(11),Ucg(12),Ucg(13),Ucg(14),Ucg(15),Ucg(16),Ucg(17),Ucg(18),
+    Ucg(19),Ucg(20),Ucg(21),Ucg(22),Ucg(23),Ucg(24),Ucg(25),Ucg(26),Ucg(27),
+    Ucg(28),Ucg(29),Ucg(30),Ucg(31),Ucg(32),Ucg(33),Ucg(34),Ucg(35),Ucg(36),
+    Ucg(37),Ucg(38),Ucg(39),Ucg(40),Ucg(41),Ucg(42),Ucg(43),Ucg(44),Ucg(45),
+    Ucg(46),Ucg(47),Ucg(48),Ucg(49),Ucg(50),Ucg(51),Ucg(52),Ucg(53),Ucg(54),
+    
Ucg(55),Ucg(56),Ucg(57),Ucg(58),Ucg(59),Ucg(60),Ucg(61),Ucg(62),Ucg(63), + Ucg(64),Ucg(65),Ucg(66),Ucg(67),Ucg(68),Ucg(69),Ucg(70),Ucg(71),Ucg(72), + Ucg(73),Ucg(74),Ucg(75),Ucg(76),Ucg(77),Ucg(78),Ucg(79),Ucg(80),Ucg(81), + Ucg(82),Ucg(83),Ucg(84),Ucg(85),Ucg(86),Ucg(87),Ucg(88),Ucg(89),Ucg(90), + Ucg(91),Ucg(92),Ucg(93),Ucg(94),Ucg(95),Ucg(96),Ucg(97),Ucg(98),Ucg(99), + Ucg(100),Ucg(101),Ucg(102),Ucg(103),Ucg(104),Ucg(105),Ucg(106),Ucg(107), + Ucg(108),Ucg(109),Ucg(110),Ucg(111),Ucg(112),Ucg(113),Ucg(114),Ucg(115), + Ucg(116),Ucg(117),Ucg(118),Ucg(119),Ucg(120),Ucg(121),Ucg(122),Ucg(123), + Ucg(124),Ucg(125),Ucg(126),Ucg(127),Ucg(128),Ucg(129),Ucg(130),Ucg(131), + Ucg(132),Ucg(133),Ucg(134),Ucg(135),Ucg(136),Ucg(137),Ucg(138),Ucg(139), + Ucg(140),Ucg(141),Ucg(142),Ucg(143),Ucg(144),Ucg(145),Ucg(146),Ucg(147), + Ucg(148),Ucg(149),Ucg(150),Ucg(151),Ucg(152),Ucg(153),Ucg(154),Ucg(155), + Ucg(156),Ucg(157),Ucg(158),Ucg(159),Ucg(160),Ucg(161),Ucg(162),Ucg(163), + Ucg(164),Ucg(165),Ucg(166),Ucg(167),Ucg(168),Ucg(169),Ucg(170),Ucg(171), + Ucg(172),Ucg(173),Ucg(174),Ucg(175),Ucg(176),Ucg(177),Ucg(178),Ucg(179), + Ucg(180),Ucg(181),Ucg(182),Ucg(183),Ucg(184),Ucg(185),Ucg(186),Ucg(187), + Ucg(188),Ucg(189),Ucg(190),Ucg(191),Ucg(192),Ucg(193),Ucg(194),Ucg(195), + Ucg(196),Ucg(197),Ucg(198),Ucg(199),Ucg(200),Ucg(201),Ucg(202),Ucg(203), + Ucg(204),Ucg(205),Ucg(206),Ucg(207),Ucg(208),Ucg(209),Ucg(210),Ucg(211), + Ucg(212),Ucg(213),Ucg(214),Ucg(215),Ucg(216),Ucg(217),Ucg(218),Ucg(219), + Ucg(220),Ucg(221),Ucg(222),Ucg(223),Ucg(224),Ucg(225),Ucg(226),Ucg(227), + Ucg(228),Ucg(229),Ucg(230),Ucg(231),Ucg(232),Ucg(233),Ucg(234),Ucg(235), + Ucg(236),Ucg(237),Ucg(238),Ucg(239),Ucg(240),Ucg(241),Ucg(242),Ucg(243), + Ucg(244),Ucg(245),Ucg(246),Ucg(247),Ucg(248),Ucg(249),Ucg(250),Ucg(251), + Ucg(252),Ucg(253),Ucg(254),Ucg(255)}; + + + static const int mapUcb[256] = { + Ucb(0),Ucb(1),Ucb(2),Ucb(3),Ucb(4),Ucb(5),Ucb(6),Ucb(7),Ucb(8),Ucb(9), + Ucb(10),Ucb(11),Ucb(12),Ucb(13),Ucb(14),Ucb(15),Ucb(16),Ucb(17),Ucb(18), + Ucb(19),Ucb(20),Ucb(21),Ucb(22),Ucb(23),Ucb(24),Ucb(25),Ucb(26),Ucb(27), + Ucb(28),Ucb(29),Ucb(30),Ucb(31),Ucb(32),Ucb(33),Ucb(34),Ucb(35),Ucb(36), + Ucb(37),Ucb(38),Ucb(39),Ucb(40),Ucb(41),Ucb(42),Ucb(43),Ucb(44),Ucb(45), + Ucb(46),Ucb(47),Ucb(48),Ucb(49),Ucb(50),Ucb(51),Ucb(52),Ucb(53),Ucb(54), + Ucb(55),Ucb(56),Ucb(57),Ucb(58),Ucb(59),Ucb(60),Ucb(61),Ucb(62),Ucb(63), + Ucb(64),Ucb(65),Ucb(66),Ucb(67),Ucb(68),Ucb(69),Ucb(70),Ucb(71),Ucb(72), + Ucb(73),Ucb(74),Ucb(75),Ucb(76),Ucb(77),Ucb(78),Ucb(79),Ucb(80),Ucb(81), + Ucb(82),Ucb(83),Ucb(84),Ucb(85),Ucb(86),Ucb(87),Ucb(88),Ucb(89),Ucb(90), + Ucb(91),Ucb(92),Ucb(93),Ucb(94),Ucb(95),Ucb(96),Ucb(97),Ucb(98),Ucb(99), + Ucb(100),Ucb(101),Ucb(102),Ucb(103),Ucb(104),Ucb(105),Ucb(106),Ucb(107), + Ucb(108),Ucb(109),Ucb(110),Ucb(111),Ucb(112),Ucb(113),Ucb(114),Ucb(115), + Ucb(116),Ucb(117),Ucb(118),Ucb(119),Ucb(120),Ucb(121),Ucb(122),Ucb(123), + Ucb(124),Ucb(125),Ucb(126),Ucb(127),Ucb(128),Ucb(129),Ucb(130),Ucb(131), + Ucb(132),Ucb(133),Ucb(134),Ucb(135),Ucb(136),Ucb(137),Ucb(138),Ucb(139), + Ucb(140),Ucb(141),Ucb(142),Ucb(143),Ucb(144),Ucb(145),Ucb(146),Ucb(147), + Ucb(148),Ucb(149),Ucb(150),Ucb(151),Ucb(152),Ucb(153),Ucb(154),Ucb(155), + Ucb(156),Ucb(157),Ucb(158),Ucb(159),Ucb(160),Ucb(161),Ucb(162),Ucb(163), + Ucb(164),Ucb(165),Ucb(166),Ucb(167),Ucb(168),Ucb(169),Ucb(170),Ucb(171), + Ucb(172),Ucb(173),Ucb(174),Ucb(175),Ucb(176),Ucb(177),Ucb(178),Ucb(179), + Ucb(180),Ucb(181),Ucb(182),Ucb(183),Ucb(184),Ucb(185),Ucb(186),Ucb(187), + 
Ucb(188),Ucb(189),Ucb(190),Ucb(191),Ucb(192),Ucb(193),Ucb(194),Ucb(195), + Ucb(196),Ucb(197),Ucb(198),Ucb(199),Ucb(200),Ucb(201),Ucb(202),Ucb(203), + Ucb(204),Ucb(205),Ucb(206),Ucb(207),Ucb(208),Ucb(209),Ucb(210),Ucb(211), + Ucb(212),Ucb(213),Ucb(214),Ucb(215),Ucb(216),Ucb(217),Ucb(218),Ucb(219), + Ucb(220),Ucb(221),Ucb(222),Ucb(223),Ucb(224),Ucb(225),Ucb(226),Ucb(227), + Ucb(228),Ucb(229),Ucb(230),Ucb(231),Ucb(232),Ucb(233),Ucb(234),Ucb(235), + Ucb(236),Ucb(237),Ucb(238),Ucb(239),Ucb(240),Ucb(241),Ucb(242),Ucb(243), + Ucb(244),Ucb(245),Ucb(246),Ucb(247),Ucb(248),Ucb(249),Ucb(250),Ucb(251), + Ucb(252),Ucb(253),Ucb(254),Ucb(255)}; + + static const int mapVcr[256] = { + Vcr(0),Vcr(1),Vcr(2),Vcr(3),Vcr(4),Vcr(5),Vcr(6),Vcr(7),Vcr(8),Vcr(9), + Vcr(10),Vcr(11),Vcr(12),Vcr(13),Vcr(14),Vcr(15),Vcr(16),Vcr(17),Vcr(18), + Vcr(19),Vcr(20),Vcr(21),Vcr(22),Vcr(23),Vcr(24),Vcr(25),Vcr(26),Vcr(27), + Vcr(28),Vcr(29),Vcr(30),Vcr(31),Vcr(32),Vcr(33),Vcr(34),Vcr(35),Vcr(36), + Vcr(37),Vcr(38),Vcr(39),Vcr(40),Vcr(41),Vcr(42),Vcr(43),Vcr(44),Vcr(45), + Vcr(46),Vcr(47),Vcr(48),Vcr(49),Vcr(50),Vcr(51),Vcr(52),Vcr(53),Vcr(54), + Vcr(55),Vcr(56),Vcr(57),Vcr(58),Vcr(59),Vcr(60),Vcr(61),Vcr(62),Vcr(63), + Vcr(64),Vcr(65),Vcr(66),Vcr(67),Vcr(68),Vcr(69),Vcr(70),Vcr(71),Vcr(72), + Vcr(73),Vcr(74),Vcr(75),Vcr(76),Vcr(77),Vcr(78),Vcr(79),Vcr(80),Vcr(81), + Vcr(82),Vcr(83),Vcr(84),Vcr(85),Vcr(86),Vcr(87),Vcr(88),Vcr(89),Vcr(90), + Vcr(91),Vcr(92),Vcr(93),Vcr(94),Vcr(95),Vcr(96),Vcr(97),Vcr(98),Vcr(99), + Vcr(100),Vcr(101),Vcr(102),Vcr(103),Vcr(104),Vcr(105),Vcr(106),Vcr(107), + Vcr(108),Vcr(109),Vcr(110),Vcr(111),Vcr(112),Vcr(113),Vcr(114),Vcr(115), + Vcr(116),Vcr(117),Vcr(118),Vcr(119),Vcr(120),Vcr(121),Vcr(122),Vcr(123), + Vcr(124),Vcr(125),Vcr(126),Vcr(127),Vcr(128),Vcr(129),Vcr(130),Vcr(131), + Vcr(132),Vcr(133),Vcr(134),Vcr(135),Vcr(136),Vcr(137),Vcr(138),Vcr(139), + Vcr(140),Vcr(141),Vcr(142),Vcr(143),Vcr(144),Vcr(145),Vcr(146),Vcr(147), + Vcr(148),Vcr(149),Vcr(150),Vcr(151),Vcr(152),Vcr(153),Vcr(154),Vcr(155), + Vcr(156),Vcr(157),Vcr(158),Vcr(159),Vcr(160),Vcr(161),Vcr(162),Vcr(163), + Vcr(164),Vcr(165),Vcr(166),Vcr(167),Vcr(168),Vcr(169),Vcr(170),Vcr(171), + Vcr(172),Vcr(173),Vcr(174),Vcr(175),Vcr(176),Vcr(177),Vcr(178),Vcr(179), + Vcr(180),Vcr(181),Vcr(182),Vcr(183),Vcr(184),Vcr(185),Vcr(186),Vcr(187), + Vcr(188),Vcr(189),Vcr(190),Vcr(191),Vcr(192),Vcr(193),Vcr(194),Vcr(195), + Vcr(196),Vcr(197),Vcr(198),Vcr(199),Vcr(200),Vcr(201),Vcr(202),Vcr(203), + Vcr(204),Vcr(205),Vcr(206),Vcr(207),Vcr(208),Vcr(209),Vcr(210),Vcr(211), + Vcr(212),Vcr(213),Vcr(214),Vcr(215),Vcr(216),Vcr(217),Vcr(218),Vcr(219), + Vcr(220),Vcr(221),Vcr(222),Vcr(223),Vcr(224),Vcr(225),Vcr(226),Vcr(227), + Vcr(228),Vcr(229),Vcr(230),Vcr(231),Vcr(232),Vcr(233),Vcr(234),Vcr(235), + Vcr(236),Vcr(237),Vcr(238),Vcr(239),Vcr(240),Vcr(241),Vcr(242),Vcr(243), + Vcr(244),Vcr(245),Vcr(246),Vcr(247),Vcr(248),Vcr(249),Vcr(250),Vcr(251), + Vcr(252),Vcr(253),Vcr(254),Vcr(255)}; + + + static const int mapVcg[256] = { + Vcg(0),Vcg(1),Vcg(2),Vcg(3),Vcg(4),Vcg(5),Vcg(6),Vcg(7),Vcg(8),Vcg(9), + Vcg(10),Vcg(11),Vcg(12),Vcg(13),Vcg(14),Vcg(15),Vcg(16),Vcg(17),Vcg(18), + Vcg(19),Vcg(20),Vcg(21),Vcg(22),Vcg(23),Vcg(24),Vcg(25),Vcg(26),Vcg(27), + Vcg(28),Vcg(29),Vcg(30),Vcg(31),Vcg(32),Vcg(33),Vcg(34),Vcg(35),Vcg(36), + Vcg(37),Vcg(38),Vcg(39),Vcg(40),Vcg(41),Vcg(42),Vcg(43),Vcg(44),Vcg(45), + Vcg(46),Vcg(47),Vcg(48),Vcg(49),Vcg(50),Vcg(51),Vcg(52),Vcg(53),Vcg(54), + Vcg(55),Vcg(56),Vcg(57),Vcg(58),Vcg(59),Vcg(60),Vcg(61),Vcg(62),Vcg(63), + 
Vcg(64),Vcg(65),Vcg(66),Vcg(67),Vcg(68),Vcg(69),Vcg(70),Vcg(71),Vcg(72), + Vcg(73),Vcg(74),Vcg(75),Vcg(76),Vcg(77),Vcg(78),Vcg(79),Vcg(80),Vcg(81), + Vcg(82),Vcg(83),Vcg(84),Vcg(85),Vcg(86),Vcg(87),Vcg(88),Vcg(89),Vcg(90), + Vcg(91),Vcg(92),Vcg(93),Vcg(94),Vcg(95),Vcg(96),Vcg(97),Vcg(98),Vcg(99), + Vcg(100),Vcg(101),Vcg(102),Vcg(103),Vcg(104),Vcg(105),Vcg(106),Vcg(107), + Vcg(108),Vcg(109),Vcg(110),Vcg(111),Vcg(112),Vcg(113),Vcg(114),Vcg(115), + Vcg(116),Vcg(117),Vcg(118),Vcg(119),Vcg(120),Vcg(121),Vcg(122),Vcg(123), + Vcg(124),Vcg(125),Vcg(126),Vcg(127),Vcg(128),Vcg(129),Vcg(130),Vcg(131), + Vcg(132),Vcg(133),Vcg(134),Vcg(135),Vcg(136),Vcg(137),Vcg(138),Vcg(139), + Vcg(140),Vcg(141),Vcg(142),Vcg(143),Vcg(144),Vcg(145),Vcg(146),Vcg(147), + Vcg(148),Vcg(149),Vcg(150),Vcg(151),Vcg(152),Vcg(153),Vcg(154),Vcg(155), + Vcg(156),Vcg(157),Vcg(158),Vcg(159),Vcg(160),Vcg(161),Vcg(162),Vcg(163), + Vcg(164),Vcg(165),Vcg(166),Vcg(167),Vcg(168),Vcg(169),Vcg(170),Vcg(171), + Vcg(172),Vcg(173),Vcg(174),Vcg(175),Vcg(176),Vcg(177),Vcg(178),Vcg(179), + Vcg(180),Vcg(181),Vcg(182),Vcg(183),Vcg(184),Vcg(185),Vcg(186),Vcg(187), + Vcg(188),Vcg(189),Vcg(190),Vcg(191),Vcg(192),Vcg(193),Vcg(194),Vcg(195), + Vcg(196),Vcg(197),Vcg(198),Vcg(199),Vcg(200),Vcg(201),Vcg(202),Vcg(203), + Vcg(204),Vcg(205),Vcg(206),Vcg(207),Vcg(208),Vcg(209),Vcg(210),Vcg(211), + Vcg(212),Vcg(213),Vcg(214),Vcg(215),Vcg(216),Vcg(217),Vcg(218),Vcg(219), + Vcg(220),Vcg(221),Vcg(222),Vcg(223),Vcg(224),Vcg(225),Vcg(226),Vcg(227), + Vcg(228),Vcg(229),Vcg(230),Vcg(231),Vcg(232),Vcg(233),Vcg(234),Vcg(235), + Vcg(236),Vcg(237),Vcg(238),Vcg(239),Vcg(240),Vcg(241),Vcg(242),Vcg(243), + Vcg(244),Vcg(245),Vcg(246),Vcg(247),Vcg(248),Vcg(249),Vcg(250),Vcg(251), + Vcg(252),Vcg(253),Vcg(254),Vcg(255)}; + +} // namespace libyuv +#endif + diff --git a/source/convert.cc b/source/convert.cc new file mode 100644 index 000000000..76c7cf274 --- /dev/null +++ b/source/convert.cc @@ -0,0 +1,1467 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "convert.h" + +#include // memcpy(), memset() +#include +#include // abs + +//#define SCALEOPT //Currently for windows only. 
June 2010 + +#ifdef SCALEOPT +#include +#endif + +#include "conversion_tables.h" + +namespace libyuv +{ + + +// Clip value to [0,255] +inline uint8 Clip(int32 val); + +#ifdef SCALEOPT +void *memcpy_16(void * dest, const void * src, size_t n); +void *memcpy_8(void * dest, const void * src, size_t n); +#endif + + +int +ConvertI420ToRGB24(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + + // RGB orientation - bottom up + uint8* out = dst_frame + dst_stride * src_height * 3 - dst_stride * 3; + uint8* out2 = out - dst_stride * 3; + int h, w; + int tmpR, tmpG, tmpB; + const uint8 *y1, *y2 ,*u, *v; + y1 = src_yplane; + y2 = y1 + src_ystride; + u = src_uplane; + v = src_vplane; + for (h = (src_height >> 1); h > 0; h--) + { + // 2 rows at a time, 2 y's at a time + for (w = 0; w < (src_width >> 1); w++) + { + // Vertical and horizontal sub-sampling + tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out[2] = Clip(tmpR); + out[1] = Clip(tmpG); + out[0] = Clip(tmpB); + + tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[2] = Clip(tmpR); + out2[1] = Clip(tmpG); + out2[0] = Clip(tmpB); + + tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[5] = Clip(tmpR); + out[4] = Clip(tmpG); + out[3] = Clip(tmpB); + + tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[5] = Clip(tmpR); + out2[4] = Clip(tmpG); + out2[3] = Clip(tmpB); + + out += 6; + out2 += 6; + y1 += 2; + y2 += 2; + u++; + v++; + } + y1 += src_ystride + src_ystride - src_width; + y2 += src_ystride + src_ystride - src_width; + u += src_ustride - (src_width >> 1); + v += src_vstride - (src_width >> 1); + out -= dst_stride * 9; + out2 -= dst_stride * 9; + } // end height for + + return 0; +} + + +int +ConvertI420ToARGB(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + + uint8* out1 = dst_frame; + uint8* out2 = out1 + dst_stride * 4; + const uint8 *y1,*y2, *u, *v; + y1 = src_yplane; + y2 = src_yplane + src_ystride; + u = src_uplane; + v = src_vplane; + int h, w; + int tmpR, tmpG, tmpB; + + for (h = (src_height >> 1); h > 0; h--) + { + // Do 2 rows at the time + for (w = 0; w < (src_width >> 1); w++) + { // Vertical and horizontal sub-sampling + + tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out1[3] = 0xff; + out1[2] = 
Clip(tmpR); + out1[1] = Clip(tmpG); + out1[0] = Clip(tmpB); + + tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[3] = 0xff; + out2[2] = Clip(tmpR); + out2[1] = Clip(tmpG); + out2[0] = Clip(tmpB); + + tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out1[7] = 0xff; + out1[6] = Clip(tmpR); + out1[5] = Clip(tmpG); + out1[4] = Clip(tmpB); + + tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[7] = 0xff; + out2[6] = Clip(tmpR); + out2[5] = Clip(tmpG); + out2[4] = Clip(tmpB); + + out1 += 8; + out2 += 8; + y1 += 2; + y2 += 2; + u++; + v++; + } + y1 += 2 * src_ystride - src_width; + y2 += 2 * src_ystride - src_width; + u += src_ustride - (src_width >> 1); + v += src_vstride - (src_width >> 1); + out1 += (2 * dst_stride - src_width) * 4; + out2 += (2 * dst_stride - src_width) * 4; + } // end height for + return 0; +} + + +int +ConvertYV12ToRGBA(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + + int32 diff = dst_stride - src_width; + + uint8 * out = dst_frame; + uint8 * out2 = out + dst_stride * 4; + const uint8 *y1,*y2, *u, *v; + int tmpG, tmpB, tmpR; + int h, w; + y1 = src_yplane; + y2 = y1 + src_ystride; + + v = src_vplane; + u = src_uplane; + + for (h = (src_height >> 1); h > 0; h--) + { + // Do 2 rows at the time + for (w = 0; w < (src_width >> 1); w++) + { + tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128 ) >> 8); + tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128 ) >> 8); + out[1] = Clip(tmpR); + out[2] = Clip(tmpG); + out[3] = Clip(tmpB); + + tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[1] = Clip(tmpR); + out2[2] = Clip(tmpG); + out2[3] = Clip(tmpB); + + tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[5] = Clip(tmpR); + out[6] = Clip(tmpG); + out[7] = Clip(tmpB); + + tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[5] = Clip(tmpR); + out2[6] = Clip(tmpG); + out2[7] = Clip(tmpB); + + out[0] = 0xff; + out[4] = 0xff; + out += 8; + out2[0] = 0xff; + out2[4] = 0xff; + out2 += 8; + y1 += 2; + y2 += 2; + u++; + v++; + } + + y1 += 2 * src_ystride - src_width; + y2 += 2 * src_ystride - src_width; + u += src_ustride - (src_width >> 1); + v += src_vstride - (src_width >> 1); + out += (src_width + diff * 2) * 4; + out2 += (src_width + diff * 2) * 4; + } + return 0; +} + + +// Little Endian... 
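+// ARGB4444: each pixel is the 16-bit word 0xARGB stored little-endian. The
+// loop below therefore writes (Clip(G) & 0xf0) + (Clip(B) >> 4) as the first
+// byte and 0xf0 + (Clip(R) >> 4) as the second, with alpha fixed at 0xF;
+// e.g. pure red (R=255, G=B=0) is emitted as the bytes 0x00, 0xff.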
+int +ConvertI420ToARGB4444(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + // RGB orientation - bottom up + uint8* out = dst_frame + dst_stride * (src_height - 1) * 2; + uint8* out2 = out - (2 * dst_stride); + int tmpR, tmpG, tmpB; + const uint8 *y1,*y2, *u, *v; + y1 = src_yplane; + y2 = y1 + src_ystride; + u = src_uplane; + v = src_vplane; + int h, w; + + for (h = (src_height >> 1); h > 0; h--) + { + // 2 rows at a time, 2 y's at a time + for (w = 0; w < (src_width >> 1); w++) + { + // Vertical and horizontal sub-sampling + // Convert to RGB888 and re-scale to 4 bits + tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out[0] =(uint8)((Clip(tmpG) & 0xf0) + (Clip(tmpB) >> 4)); + out[1] = (uint8)(0xf0 + (Clip(tmpR) >> 4)); + + tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[0] = (uint8)((Clip(tmpG) & 0xf0 ) + (Clip(tmpB) >> 4)); + out2[1] = (uint8) (0xf0 + (Clip(tmpR) >> 4)); + + tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[2] = (uint8)((Clip(tmpG) & 0xf0 ) + (Clip(tmpB) >> 4)); + out[3] = (uint8)(0xf0 + (Clip(tmpR) >> 4)); + + tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[2] = (uint8)((Clip(tmpG) & 0xf0 ) + (Clip(tmpB) >> 4)); + out2[3] = (uint8)(0xf0 + (Clip(tmpR) >> 4)); + + out += 4; + out2 += 4; + y1 += 2; + y2 += 2; + u++; + v++; + } + y1 += 2 * src_ystride - src_width; + y2 += 2 * src_ystride - src_width; + u += src_ustride - (src_width >> 1); + v += src_vstride - (src_width >> 1); + out -= (2 * dst_stride + src_width) * 2; + out2 -= (2 * dst_stride + src_width) * 2; + } // end height for + + return 0; +} + + +int +ConvertI420ToRGB565(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + + uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1); + uint16* out2 = out - dst_stride; + int tmpR, tmpG, tmpB; + const uint8 *y1,*y2, *u, *v; + y1 = src_yplane; + y2 = y1 + src_ystride; + u = src_uplane; + v = src_vplane; + int h, w; + + for (h = (src_height >> 1); h > 0; h--) + { + // 2 rows at a time, 2 y's at a time + for (w = 0; w < (src_width >> 1); w++) + { + // Vertical and horizontal sub-sampling + // 1. Convert to RGB888 + // 2. 
Shift to adequate location (in the 16 bit word) - RGB 565 + + tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out[0] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG) + & 0xfc) << 3) + (Clip(tmpB) >> 3); + + tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[0] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG) + & 0xfc) << 3) + (Clip(tmpB) >> 3); + + tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[1] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG) + & 0xfc) << 3) + (Clip(tmpB ) >> 3); + + tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[1] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG) + & 0xfc) << 3) + (Clip(tmpB) >> 3); + + y1 += 2; + y2 += 2; + out += 2; + out2 += 2; + u++; + v++; + } + y1 += 2 * src_ystride - src_width; + y2 += 2 * src_ystride - src_width; + u += src_ustride - (src_width >> 1); + v += src_vstride - (src_width >> 1); + out -= 2 * dst_stride + src_width; + out2 -= 2 * dst_stride + src_width; + } // end height for + + return 0; +} + + +//Same as ConvertI420ToRGB565 but doesn't flip vertically. +int +ConvertI420ToRGB565Android(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + + uint16* out = (uint16*)(dst_frame); + uint16* out2 = out + dst_stride; + int tmpR, tmpG, tmpB; + const uint8 *y1,*y2, *u, *v; + int h, w; + y1 = src_yplane; + y2 = y1 + src_ystride; + u = src_uplane; + v = src_vplane; + + for (h = (src_height >> 1); h > 0; h--) + { + // 2 rows at a time, 2 y's at a time + for (w = 0; w < (src_width >> 1); w++) + { + // Vertical and horizontal sub-sampling + // 1. Convert to RGB888 + // 2. 
Shift to adequate location (in the 16 bit word) - RGB 565 + + tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out[0] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG) + & 0xfc) << 3) + (Clip(tmpB) >> 3); + + tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[0] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG) + & 0xfc) << 3) + (Clip(tmpB) >> 3); + + tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[1] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG) + & 0xfc) << 3) + (Clip(tmpB ) >> 3); + + tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[1] = (uint16)((Clip(tmpR) & 0xf8) << 8) + ((Clip(tmpG) + & 0xfc) << 3) + (Clip(tmpB) >> 3); + + y1 += 2; + y2 += 2; + out += 2; + out2 += 2; + u++; + v++; + } + y1 += 2 * src_ystride - src_width; + y2 += 2 * src_ystride - src_width; + u += src_ustride - (src_width >> 1); + v += src_vstride - (src_width >> 1); + out += 2 * dst_stride - src_width; + out2 += 2 * dst_stride - src_width; + } // end height for + + return 0; +} + + +int +ConvertI420ToARGB1555(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + + uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1); + uint16* out2 = out - dst_stride ; + int32 tmpR, tmpG, tmpB; + const uint8 *y1,*y2, *u, *v; + int h, w; + + y1 = src_yplane; + y2 = y1 + src_ystride; + u = src_uplane; + v = src_vplane; + + for (h = (src_height >> 1); h > 0; h--) + { // 2 rows at a time, 2 y's at a time + for (w = 0; w < (src_width >> 1); w++) + { + // Vertical and horizontal sub-sampling + // 1. Convert to RGB888 + // 2. Shift to adequate location (in the 16 bit word) - RGB 555 + // 3. 
Add 1 for alpha value + tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out[0] = (uint16)(0x8000 + ((Clip(tmpR) & 0xf8) << 10) + + ((Clip(tmpG) & 0xf8) << 3) + (Clip(tmpB) >> 3)); + + tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[0] = (uint16)(0x8000 + ((Clip(tmpR) & 0xf8) << 10) + + ((Clip(tmpG) & 0xf8) << 3) + (Clip(tmpB) >> 3)); + + tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[1] = (uint16)(0x8000 + ((Clip(tmpR) & 0xf8) << 10) + + ((Clip(tmpG) & 0xf8) << 3) + (Clip(tmpB) >> 3)); + + tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + + 128) >> 8); + tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[1] = (uint16)(0x8000 + ((Clip(tmpR) & 0xf8) << 10) + + ((Clip(tmpG) & 0xf8) << 3) + (Clip(tmpB) >> 3)); + + y1 += 2; + y2 += 2; + out += 2; + out2 += 2; + u++; + v++; + } + y1 += 2 * src_ystride - src_width; + y2 += 2 * src_ystride - src_width; + u += src_ustride - (src_width >> 1); + v += src_vstride - (src_width >> 1); + out -= 2 * dst_stride + src_width; + out2 -= 2 * dst_stride + src_width; + } // end height for + return 0; +} + + +int +ConvertI420ToYUY2(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + + const uint8* in1 = src_yplane; + const uint8* in2 = src_yplane + src_ystride ; + const uint8* inU = src_uplane; + const uint8* inV = src_vplane; + + uint8* out1 = dst_frame; + uint8* out2 = dst_frame + 2 * dst_stride; + + // YUY2 - Macro-pixel = 2 image pixels + // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
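+  // A 4-byte macro-pixel Y0 U0 Y1 V0 covers two horizontally adjacent
+  // pixels, and each 2x2 block of luma samples shares one (U,V) pair; that
+  // is why the loops below emit two output rows per pass and advance
+  // inU/inV once for every two pixels.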
+#ifndef SCALEOPT + for (int i = 0; i < (src_height >> 1);i++) + { + for (int j = 0; j < (src_width >> 1);j++) + { + out1[0] = in1[0]; + out1[1] = *inU; + out1[2] = in1[1]; + out1[3] = *inV; + + out2[0] = in2[0]; + out2[1] = *inU; + out2[2] = in2[1]; + out2[3] = *inV; + out1 += 4; + out2 += 4; + inU++; + inV++; + in1 += 2; + in2 += 2; + } + in1 += 2 * src_ystride - src_width; + in2 += 2 * src_ystride - src_width; + inU += src_ustride - (src_width >> 1); + inV += src_vstride - (src_width >> 1); + out1 += 2 * dst_stride + 2 * (dst_stride - src_width); + out2 += 2 * dst_stride + 2 * (dst_stride - src_width); + } +#else + for (WebRtc_UWord32 i = 0; i < (height >> 1);i++) + { + int32 width__ = (width >> 4); + _asm + { + ;pusha + mov eax, DWORD PTR [in1] ;1939.33 + mov ecx, DWORD PTR [in2] ;1939.33 + mov ebx, DWORD PTR [inU] ;1939.33 + mov edx, DWORD PTR [inV] ;1939.33 + loop0: + movq xmm6, QWORD PTR [ebx] ;inU + movq xmm0, QWORD PTR [edx] ;inV + punpcklbw xmm6, xmm0 ;inU, inV mix + ;movdqa xmm1, xmm6 + ;movdqa xmm2, xmm6 + ;movdqa xmm4, xmm6 + + movdqu xmm3, XMMWORD PTR [eax] ;in1 + movdqa xmm1, xmm3 + punpcklbw xmm1, xmm6 ;in1, inU, in1, inV + mov esi, DWORD PTR [out1] + movdqu XMMWORD PTR [esi], xmm1 ;write to out1 + + movdqu xmm5, XMMWORD PTR [ecx] ;in2 + movdqa xmm2, xmm5 + punpcklbw xmm2, xmm6 ;in2, inU, in2, inV + mov edi, DWORD PTR [out2] + movdqu XMMWORD PTR [edi], xmm2 ;write to out2 + + punpckhbw xmm3, xmm6 ;in1, inU, in1, inV again + movdqu XMMWORD PTR [esi+16], xmm3 ;write to out1 again + add esi, 32 + mov DWORD PTR [out1], esi + + punpckhbw xmm5, xmm6 ;inU, in2, inV again + movdqu XMMWORD PTR [edi+16], xmm5 ;write to out2 again + add edi, 32 + mov DWORD PTR [out2], edi + + add ebx, 8 + add edx, 8 + add eax, 16 + add ecx, 16 + + mov esi, DWORD PTR [width__] + sub esi, 1 + mov DWORD PTR [width__], esi + jg loop0 + + mov DWORD PTR [in1], eax ;1939.33 + mov DWORD PTR [in2], ecx ;1939.33 + mov DWORD PTR [inU], ebx ;1939.33 + mov DWORD PTR [inV], edx ;1939.33 + + ;popa + emms + } + in1 += 2 * src_ystride - src_width; + in2 += 2 * src_ystride - src_width; + out1 += 2 * strideOut + 2 * (strideOut - width); + out2 += 2 * strideOut + 2 * (strideOut - width); + } +#endif + return 0; +} + +int +ConvertI420ToUYVY(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + int i = 0; + const uint8* y1 = src_yplane; + const uint8* y2 = y1 + src_ystride; + const uint8* u = src_uplane; + const uint8* v = src_vplane; + + uint8* out1 = dst_frame; + uint8* out2 = dst_frame + 2 * dst_stride; + + // Macro-pixel = 2 image pixels + // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5..... 
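+  // UYVY is the same packing as YUY2 but with chroma leading luma: each
+  // macro-pixel is written U0 Y0 V0 Y1 rather than Y0 U0 Y1 V0, with the
+  // same 2x2 chroma sharing between two rows.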
+ +#ifndef SCALEOPT + for (; i < (src_height >> 1);i++) + { + for (uint32 j = 0; j < (src_width >> 1) ;j++) + { + out1[0] = *u; + out1[1] = y1[0]; + out1[2] = *v; + out1[3] = y1[1]; + + out2[0] = *u; + out2[1] = y2[0]; + out2[2] = *v; + out2[3] = y2[1]; + out1 += 4; + out2 += 4; + u++; + v++; + y1 += 2; + y2 += 2; + } + y1 += 2 * src_ystride - src_width; + y2 += 2 * src_ystride - src_width; + u += src_ustride - (src_width >> 1); + v += src_vstride - (src_width >> 1); + out1 += 2 * (dst_stride + (dst_stride - src_width)); + out2 += 2 * (dst_stride + (dst_stride - src_width)); + } +#else + for (; i < (height >> 1);i++) + { + int32 width__ = (width >> 4); + _asm + { + ;pusha + mov eax, DWORD PTR [in1] ;1939.33 + mov ecx, DWORD PTR [in2] ;1939.33 + mov ebx, DWORD PTR [inU] ;1939.33 + mov edx, DWORD PTR [inV] ;1939.33 +loop0: + movq xmm6, QWORD PTR [ebx] ;inU + movq xmm0, QWORD PTR [edx] ;inV + punpcklbw xmm6, xmm0 ;inU, inV mix + movdqa xmm1, xmm6 + movdqa xmm2, xmm6 + movdqa xmm4, xmm6 + + movdqu xmm3, XMMWORD PTR [eax] ;in1 + punpcklbw xmm1, xmm3 ;inU, in1, inV + mov esi, DWORD PTR [out1] + movdqu XMMWORD PTR [esi], xmm1 ;write to out1 + + movdqu xmm5, XMMWORD PTR [ecx] ;in2 + punpcklbw xmm2, xmm5 ;inU, in2, inV + mov edi, DWORD PTR [out2] + movdqu XMMWORD PTR [edi], xmm2 ;write to out2 + + punpckhbw xmm4, xmm3 ;inU, in1, inV again + movdqu XMMWORD PTR [esi+16], xmm4 ;write to out1 again + add esi, 32 + mov DWORD PTR [out1], esi + + punpckhbw xmm6, xmm5 ;inU, in2, inV again + movdqu XMMWORD PTR [edi+16], xmm6 ;write to out2 again + add edi, 32 + mov DWORD PTR [out2], edi + + add ebx, 8 + add edx, 8 + add eax, 16 + add ecx, 16 + + mov esi, DWORD PTR [width__] + sub esi, 1 + mov DWORD PTR [width__], esi + jg loop0 + + mov DWORD PTR [in1], eax ;1939.33 + mov DWORD PTR [in2], ecx ;1939.33 + mov DWORD PTR [inU], ebx ;1939.33 + mov DWORD PTR [inV], edx ;1939.33 + + ;popa + emms + } + in1 += width; + in2 += width; + out1 += 2 * (strideOut + (strideOut - width)); + out2 += 2 * (strideOut + (strideOut - width)); + } +#endif + return 0; +} + +int +ConvertI420ToYV12(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || src_ystride == 0 || + src_ustride == 0 || src_vstride == 0 || dst_stride == 0) + { + return -1; + } + const uint8* inFrame = src_yplane; + uint8* outFrame = dst_frame; + // Copy Y + for (int i = 0; i < src_height; i++) + { +#ifndef SCALEOPT + memcpy(outFrame, inFrame, src_width); +#else + memcpy_16(outFrame, inFrame, src_width); +#endif + inFrame += src_ystride; + outFrame += dst_stride; + } + // Copy U + inFrame = src_uplane; + outFrame += (dst_stride >> 1) * src_height >> 1; + for (uint32 i = 0; i < src_height >>1; i++) + { +#ifndef SCALEOPT + memcpy(outFrame, inFrame, src_width >> 1); +#else + memcpy_8(outFrame, inFrame, src_width >> 1); +#endif + inFrame += src_ustride; + outFrame += dst_stride >> 1; + } + outFrame -= dst_stride * src_height >> 1; + // Copy V + for (uint32 i = 0; i < src_height >> 1; i++) + { +#ifndef SCALEOPT + memcpy(outFrame, inFrame, src_width >> 1); +#else + memcpy_8(outFrame, inFrame, width >> 1); +#endif + inFrame += src_vstride; + outFrame += dst_stride >> 1; + } + return 0; +} + + +int +ConvertNv12ToRGB565(const uint8* src_yplane, int src_ystride, + const uint8* src_uvplane, int src_uvstride, + uint8* dst_frame, int dst_stride, + int src_width, + int src_height + ) 
+{
+    if (src_width < 1 || src_height < 1 || src_ystride < 1 || src_uvstride < 1)
+    {
+        return -1;
+    }
+
+    // Bi-Planar: Y plane followed by an interleaved U and V plane
+    const uint8* interleavedSrc = src_uvplane;
+    // Output is written bottom up: start at the last row of the destination.
+    uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1);
+    uint16* out2 = out - dst_stride;
+    int32 tmpR, tmpG, tmpB;
+    const uint8 *y1, *y2;
+    y1 = src_yplane;
+    y2 = y1 + src_ystride;
+    int h, w;
+
+    for (h = (src_height >> 1); h > 0; h--)
+    {
+        // 2 rows at a time, 2 y's at a time
+        for (w = 0; w < (src_width >> 1); w++)
+        {
+            // Vertical and horizontal sub-sampling
+            // 1. Convert to RGB888
+            // 2. Shift to adequate location (in the 16 bit word) - RGB 565
+
+            tmpR = (int32)((mapYc[y1[0]] + mapVcr[interleavedSrc[1]]
+                            + 128) >> 8);
+            tmpG = (int32)((mapYc[y1[0]] + mapUcg[interleavedSrc[0]]
+                            + mapVcg[interleavedSrc[1]] + 128) >> 8);
+            tmpB = (int32)((mapYc[y1[0]] + mapUcb[interleavedSrc[0]]
+                            + 128) >> 8);
+            out[0] = (uint16)(((Clip(tmpR) & 0xf8) << 8) +
+                              ((Clip(tmpG) & 0xfc) << 3) + (Clip(tmpB) >> 3));
+
+            tmpR = (int32)((mapYc[y2[0]] + mapVcr[interleavedSrc[1]]
+                            + 128) >> 8);
+            tmpG = (int32)((mapYc[y2[0]] + mapUcg[interleavedSrc[0]]
+                            + mapVcg[interleavedSrc[1]] + 128) >> 8);
+            tmpB = (int32)((mapYc[y2[0]] + mapUcb[interleavedSrc[0]]
+                            + 128) >> 8);
+            out2[0] = (uint16)(((Clip(tmpR) & 0xf8) << 8) +
+                               ((Clip(tmpG) & 0xfc) << 3) + (Clip(tmpB) >> 3));
+
+            tmpR = (int32)((mapYc[y1[1]] + mapVcr[interleavedSrc[1]]
+                            + 128) >> 8);
+            tmpG = (int32)((mapYc[y1[1]] + mapUcg[interleavedSrc[0]]
+                            + mapVcg[interleavedSrc[1]] + 128) >> 8);
+            tmpB = (int32)((mapYc[y1[1]] + mapUcb[interleavedSrc[0]]
+                            + 128) >> 8);
+            out[1] = (uint16)(((Clip(tmpR) & 0xf8) << 8) +
+                              ((Clip(tmpG) & 0xfc) << 3) + (Clip(tmpB) >> 3));
+
+            tmpR = (int32)((mapYc[y2[1]] + mapVcr[interleavedSrc[1]]
+                            + 128) >> 8);
+            tmpG = (int32)((mapYc[y2[1]] + mapUcg[interleavedSrc[0]]
+                            + mapVcg[interleavedSrc[1]] + 128) >> 8);
+            tmpB = (int32)((mapYc[y2[1]] + mapUcb[interleavedSrc[0]]
+                            + 128) >> 8);
+            out2[1] = (uint16)(((Clip(tmpR) & 0xf8) << 8) +
+                               ((Clip(tmpG) & 0xfc) << 3) + (Clip(tmpB) >> 3));
+
+            y1 += 2;
+            y2 += 2;
+            out += 2;
+            out2 += 2;
+            interleavedSrc += 2;
+        }
+        y1 += 2 * src_ystride - src_width;
+        y2 += 2 * src_ystride - src_width;
+        // The interleaved chroma row holds src_width bytes
+        // (src_width / 2 U/V pairs), so skip the full row remainder.
+        interleavedSrc += src_uvstride - src_width;
+        out -= 2 * dst_stride + src_width;
+        out2 -= 2 * dst_stride + src_width;
+    } // end height for
+
+    return 0;
+}
+
+int
+ConvertI420ToABGR(const uint8* src_yplane, int src_ystride,
+                  const uint8* src_uplane, int src_ustride,
+                  const uint8* src_vplane, int src_vstride,
+                  uint8* dst_frame, int dst_stride,
+                  int src_width,
+                  int src_height
+                  )
+{
+    if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
+        src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
+    {
+        return -1;
+    }
+
+    // RGB orientation - bottom up
+    uint8* out = dst_frame + 4 * dst_stride * (src_height - 1);
+    uint8* out2 = out - dst_stride * 4;
+    int32 tmpR, tmpG, tmpB;
+    const uint8 *y1, *y2, *u, *v;
+    int h, w;
+
+    y1 = src_yplane;
+    y2 = y1 + src_ystride;
+    u = src_uplane;
+    v = src_vplane;
+
+    for (h = (src_height >> 1); h > 0; h--)
+    {
+        // 2 rows at a time, 2 y's at a time
+        for (w = 0; w < (src_width >> 1); w++)
+        {
+            // Vertical and horizontal sub-sampling
+            tmpR = (int32)((298 * (y1[0] - 16) + 409 * (v[0] - 128)
+                            + 128) >> 8);
+            tmpG = (int32)((298 * (y1[0] - 16) - 100 * (u[0] - 128)
+                            - 208 * (v[0] - 128) + 128) >> 8);
+            tmpB = (int32)((298 * (y1[0] - 16) + 516 * (u[0] - 128)
+                            + 128) >> 8);
+
+            out[3] = 0xff;
+            out[0] = Clip(tmpR);
out[1] = Clip(tmpG); + out[2] = Clip(tmpB); + + tmpR = (int32)((298 * (y2[0] - 16) + 409 * (v[0] - 128) + + 128) >> 8); + tmpG = (int32)((298 * (y2[0] - 16) - 100 * (u[0] - 128) + - 208 * (v[0] - 128) + 128) >> 8); + tmpB = (int32)((298 * (y2[0] - 16) + 516 * (u[0] - 128) + + 128) >> 8); + + out2[3] = 0xff; + out2[0] = Clip(tmpR); + out2[1] = Clip(tmpG); + out2[2] = Clip(tmpB); + + tmpR = (int32)((298 * (y1[1] - 16) + 409 * (v[0] - 128) + + 128 ) >> 8); + tmpG = (int32)((298 * (y1[1] - 16) - 100 * (u[0] - 128) + - 208 * (v[0] - 128) + 128 ) >> 8); + tmpB = (int32)((298 * (y1[1] - 16) + 516 * (u[0] - 128) + + 128) >> 8); + + out[7] = 0xff; + out[4] = Clip(tmpR); + out[5] = Clip(tmpG); + out[6] = Clip(tmpB); + + tmpR = (int32)((298 * (y2[1] - 16) + 409 * (v[0] - 128) + + 128) >> 8); + tmpG = (int32)((298 * (y2[1] - 16) - 100 * (u[0] - 128) + - 208 * (v[0] - 128) + 128) >> 8); + tmpB = (int32)((298 * (y2[1] - 16) + 516 * (u[0] - 128) + + 128 ) >> 8); + + out2[7] = 0xff; + out2[4] = Clip(tmpR); + out2[5] = Clip(tmpG); + out2[6] = Clip(tmpB); + + out += 8; + out2 += 8; + y1 += 2; + y2 += 2; + u++; + v++; + } + + y1 += src_ystride + src_ystride - src_width; + y2 += src_ystride + src_ystride - src_width; + u += src_ustride - (src_width >> 1); + v += src_vstride - (src_width >> 1); + out -= (2 * dst_stride + src_width) * 4; + out2 -= (2 * dst_stride + src_width) * 4; + } // end height for + return 0; +} + + +int +ConvertUYVYToI420(const uint8* src_frame, int src_stride, + uint8* dst_yplane, int dst_ystride, + uint8* dst_uplane, int dst_ustride, + uint8* dst_vplane, int dst_vstride, + int src_width, + int src_height + ) +{ + if (src_width == 0 || src_height == 0 || dst_ystride == 0 || + dst_ustride == 0 || dst_vstride == 0 || src_stride == 0) + { + return -1; + } + + int i, j; + uint8* outI = dst_yplane; + uint8* outU = dst_uplane; + uint8* outV = dst_vplane; + + // U0Y0V0Y1..U2Y2V2Y3..... 
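+    // Read back the other way, each 4-byte group splits as below (an
+    // illustrative sketch mirroring the loop that follows; UnpackUyvy is not
+    // a function in these sources):
+    //   static inline void UnpackUyvy(const uint8* in, uint8* y0, uint8* y1,
+    //                                 uint8* u, uint8* v) {
+    //     *u  = in[0];
+    //     *y0 = in[1];
+    //     *v  = in[2];
+    //     *y1 = in[3];
+    //   }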
+
+    for (i = 0; i < (src_height >> 1); i++)
+    {
+        // Even rows carry chroma; odd rows contribute luma only.
+        for (j = 0; j < (src_width >> 1); j++)
+        {
+            outI[0] = src_frame[1];
+            *outU = src_frame[0];
+            outI[1] = src_frame[3];
+            *outV = src_frame[2];
+            src_frame += 4;
+            outI += 2;
+            outU++;
+            outV++;
+        }
+        // Finish the even row. Strides are in pixels, matching the packed
+        // writers above, so each UYVY row occupies 2 * src_stride bytes.
+        outI += dst_ystride - src_width;
+        src_frame += (src_stride - src_width) << 1;
+        for (j = 0; j < (src_width >> 1); j++)
+        {
+            outI[0] = src_frame[1];
+            outI[1] = src_frame[3];
+            src_frame += 4;
+            outI += 2;
+        }
+        outI += dst_ystride - src_width;
+        src_frame += (src_stride - src_width) << 1;
+        outU += dst_ustride - (src_width >> 1);
+        outV += dst_vstride - (src_width >> 1);
+    }
+    return 0;
+}
+
+
+int
+ConvertRGB24ToARGB(const uint8* src_frame, int src_stride,
+                   uint8* dst_frame, int dst_stride,
+                   int src_width, int src_height
+                   )
+{
+    if (src_width < 1 || src_height < 1 || dst_stride < 1)
+    {
+        return -1;
+    }
+
+    int i, j, offset;
+    uint8* outFrame = dst_frame;
+    const uint8* inFrame = src_frame;
+
+    // ARGB is written bottom up. The inner loop indexes through offset and
+    // never advances outFrame, so step back one full row per pass.
+    outFrame += dst_stride * (src_height - 1) * 4;
+    for (i = 0; i < src_height; i++)
+    {
+        for (j = 0; j < src_width; j++)
+        {
+            offset = j * 4;
+            outFrame[0 + offset] = inFrame[0];
+            outFrame[1 + offset] = inFrame[1];
+            outFrame[2 + offset] = inFrame[2];
+            outFrame[3 + offset] = 0xff;
+            inFrame += 3;
+        }
+        outFrame -= 4 * dst_stride;
+        // src_stride is in pixels (3 bytes per RGB24 pixel).
+        inFrame += 3 * (src_stride - src_width);
+    }
+    return 0;
+}
+
+
+int
+ConvertRGB24ToI420(const uint8* src_frame, int src_stride,
+                   uint8* dst_yplane, int dst_ystride,
+                   uint8* dst_uplane, int dst_ustride,
+                   uint8* dst_vplane, int dst_vstride,
+                   int src_width,
+                   int src_height
+                   )
+{
+    if (src_width == 0 || src_height == 0 || dst_ystride == 0 ||
+        dst_ustride == 0 || dst_vstride == 0 || src_stride == 0)
+    {
+        return -1;
+    }
+    uint8* yStartPtr;
+    uint8* yStartPtr2;
+    uint8* uStartPtr;
+    uint8* vStartPtr;
+    const uint8* inpPtr;
+    const uint8* inpPtr2;
+    int h, w;
+
+    // Assuming RGB in a bottom up orientation.
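+    // The fixed-point coefficients below are the usual BT.601 studio-swing
+    // approximation; as scalar formulas (r, g, b are 8-bit samples):
+    //   Y = (( 66*r + 129*g +  25*b + 128) >> 8) + 16
+    //   U = ((-38*r -  74*g + 112*b + 128) >> 8) + 128
+    //   V = ((112*r -  94*g -  18*b + 128) >> 8) + 128
+    // The same constants appear in RGBToYUV() in format_conversion.cc.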
+    yStartPtr = dst_yplane;
+    yStartPtr2 = yStartPtr + dst_ystride;
+    uStartPtr = dst_uplane;
+    vStartPtr = dst_vplane;
+    // Start at the last RGB24 row; src_stride is in pixels (3 bytes each).
+    inpPtr = src_frame + 3 * src_stride * (src_height - 1);
+    inpPtr2 = inpPtr - 3 * src_stride;
+
+    for (h = 0; h < (src_height >> 1); h++)
+    {
+        for (w = 0; w < (src_width >> 1); w++)
+        {
+            // Y
+            yStartPtr[0] = (uint8)((66 * inpPtr[2] + 129 * inpPtr[1]
+                                    + 25 * inpPtr[0] + 128) >> 8) + 16;
+            yStartPtr2[0] = (uint8)((66 * inpPtr2[2] + 129 * inpPtr2[1]
+                                     + 25 * inpPtr2[0] + 128) >> 8) + 16;
+            // Moving to next column
+            yStartPtr[1] = (uint8)((66 * inpPtr[5] + 129 * inpPtr[4]
+                                    + 25 * inpPtr[3] + 128) >> 8) + 16;
+            yStartPtr2[1] = (uint8)((66 * inpPtr2[5] + 129 * inpPtr2[4]
+                                     + 25 * inpPtr2[3] + 128) >> 8) + 16;
+            // U
+            uStartPtr[0] = (uint8)((-38 * inpPtr[2] - 74 * inpPtr[1]
+                                    + 112 * inpPtr[0] + 128) >> 8) + 128;
+            // V
+            vStartPtr[0] = (uint8)((112 * inpPtr[2] - 94 * inpPtr[1]
+                                    - 18 * inpPtr[0] + 128) >> 8) + 128;
+
+            yStartPtr += 2;
+            yStartPtr2 += 2;
+            uStartPtr++;
+            vStartPtr++;
+            inpPtr += 6;
+            inpPtr2 += 6;
+        } // end for w
+        yStartPtr += dst_ystride + dst_ystride - src_width;
+        yStartPtr2 += dst_ystride + dst_ystride - src_width;
+        // One chroma row is produced per pass, so advance by a single
+        // chroma stride.
+        uStartPtr += dst_ustride - (src_width >> 1);
+        vStartPtr += dst_vstride - (src_width >> 1);
+        inpPtr -= 3 * (2 * src_stride + src_width);
+        inpPtr2 -= 3 * (2 * src_stride + src_width);
+    } // end for h
+    return 0;
+}
+
+
+int
+ConvertYV12ToARGB(const uint8* src_yplane, int src_ystride,
+                  const uint8* src_uplane, int src_ustride,
+                  const uint8* src_vplane, int src_vstride,
+                  uint8* dst_frame, int dst_stride,
+                  int src_width,
+                  int src_height
+                  )
+{
+    if (src_width == 0 || src_height == 0 || src_ystride == 0 ||
+        src_ustride == 0 || src_vstride == 0 || dst_stride == 0)
+    {
+        return -1;
+    }
+    uint8* out = dst_frame;
+    uint8* out2 = out + dst_stride * 4;
+    const uint8 *y1, *y2, *u, *v;
+    int h, w;
+    y1 = src_yplane;
+    y2 = y1 + src_ystride;
+    v = src_vplane;
+    u = src_uplane;
+
+    int32 tmpG, tmpB, tmpR;
+    for (h = (src_height >> 1); h > 0; h--)
+    {
+        // 2 rows at a time
+        for (w = 0; w < (src_width >> 1); w++)
+        {
+            tmpR = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
+            tmpG = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+                            + 128) >> 8);
+            tmpB = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
+            out[2] = Clip(tmpR);
+            out[1] = Clip(tmpG);
+            out[0] = Clip(tmpB);
+
+            tmpR = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
+            tmpG = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]]
+                            + 128) >> 8);
+            tmpB = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
+            out2[2] = Clip(tmpR);
+            out2[1] = Clip(tmpG);
+            out2[0] = Clip(tmpB);
+
+            tmpR = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
+            tmpG = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+                            + 128) >> 8);
+            tmpB = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
+            out[6] = Clip(tmpR);
+            out[5] = Clip(tmpG);
+            out[4] = Clip(tmpB);
+
+            tmpR = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
+            tmpG = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]]
+                            + 128) >> 8);
+            tmpB = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
+            out2[6] = Clip(tmpR);
+            out2[5] = Clip(tmpG);
+            out2[4] = Clip(tmpB);
+
+            out[3] = 0xff;
+            out[7] = 0xff;
+            out += 8;
+            out2[3] = 0xff;
+            out2[7] = 0xff;
+            out2 += 8;
+            y1 += 2;
+            y2 += 2;
+            u++;
+            v++;
+        }
+
+        y1 += 2 * src_ystride - src_width;
+        y2 += 2 * src_ystride - src_width;
+        u += src_ustride - (src_width >> 1);
+        v += src_vstride - (src_width >> 1);
+
+        out += 4 * (2 * dst_stride - src_width);
+        out2 += 4 * (2 * dst_stride - src_width);
+    }
+    return 0;
+}
+
+
+int
+ConvertABGRToI420(const uint8* src_frame, int src_stride,
+                  uint8* dst_yplane, int dst_ystride,
+                  uint8* dst_uplane, int dst_ustride,
+                  uint8* dst_vplane, int dst_vstride,
+                  int src_width,
+                  int src_height
+                  )
+{
+    if (src_width == 0 || src_height == 0 || dst_ystride == 0 ||
+        dst_ustride == 0 || dst_vstride == 0 || src_stride == 0)
+    {
+        return -1;
+    }
+
+    uint8* yStartPtr;
+    uint8* yStartPtr2;
+    uint8* uStartPtr;
+    uint8* vStartPtr;
+    const uint8* inpPtr;
+    const uint8* inpPtr2;
+
+    yStartPtr = dst_yplane;
+    yStartPtr2 = yStartPtr + dst_ystride;
+    uStartPtr = dst_uplane;
+    vStartPtr = dst_vplane;
+    inpPtr = src_frame;
+    inpPtr2 = inpPtr + 4 * src_stride;
+    int h, w;
+    for (h = 0; h < (src_height >> 1); h++)
+    {
+        for (w = 0; w < (src_width >> 1); w++)
+        {
+            // Byte order per pixel is R, G, B, A, matching the layout
+            // written by ConvertI420ToABGR above.
+            // Y
+            yStartPtr[0] = (uint8)((66 * inpPtr[0] + 129 * inpPtr[1]
+                                    + 25 * inpPtr[2] + 128) >> 8) + 16;
+            yStartPtr2[0] = (uint8)((66 * inpPtr2[0] + 129 * inpPtr2[1]
+                                     + 25 * inpPtr2[2] + 128) >> 8) + 16;
+            // Moving to next column
+            yStartPtr[1] = (uint8)((66 * inpPtr[4] + 129 * inpPtr[5]
+                                    + 25 * inpPtr[6] + 128) >> 8) + 16;
+            yStartPtr2[1] = (uint8)((66 * inpPtr2[4] + 129 * inpPtr2[5]
+                                     + 25 * inpPtr2[6] + 128) >> 8) + 16;
+            // U
+            uStartPtr[0] = (uint8)((-38 * inpPtr[0] - 74 * inpPtr[1]
+                                    + 112 * inpPtr[2] + 128) >> 8) + 128;
+            // V
+            vStartPtr[0] = (uint8)((112 * inpPtr[0] - 94 * inpPtr[1]
+                                    - 18 * inpPtr[2] + 128) >> 8) + 128;
+
+            yStartPtr += 2;
+            yStartPtr2 += 2;
+            uStartPtr++;
+            vStartPtr++;
+            inpPtr += 8;
+            inpPtr2 += 8;
+        }
+
+        yStartPtr += 2 * dst_ystride - src_width;
+        yStartPtr2 += 2 * dst_ystride - src_width;
+        // One chroma row is produced per pass.
+        uStartPtr += dst_ustride - (src_width >> 1);
+        vStartPtr += dst_vstride - (src_width >> 1);
+        inpPtr += 4 * (2 * src_stride - src_width);
+        inpPtr2 += 4 * (2 * src_stride - src_width);
+    }
+    return 0;
+}
+
+inline
+uint8 Clip(int32 val)
+{
+    if (val < 0)
+    {
+        return (uint8)0;
+    } else if (val > 255)
+    {
+        return (uint8)255;
+    }
+    return (uint8)val;
+}
+
+#ifdef SCALEOPT
+// memcpy_16 assumes that n is an integer multiple of 16!
+void*
+memcpy_16(void* dest, const void* src, size_t n)
+{
+    _asm
+    {
+        mov eax, dword ptr [src]
+        mov ebx, dword ptr [dest]
+        mov ecx, dword ptr [n]
+
+    loop0:
+
+        movdqu xmm0, XMMWORD PTR [eax]
+        movdqu XMMWORD PTR [ebx], xmm0
+        add eax, 16
+        add ebx, 16
+        sub ecx, 16
+        jg loop0
+    }
+    return dest;
+}
+
+// memcpy_8 assumes that n is an integer multiple of 8!
+void*
+memcpy_8(void* dest, const void* src, size_t n)
+{
+    _asm
+    {
+        mov eax, dword ptr [src]
+        mov ebx, dword ptr [dest]
+        mov ecx, dword ptr [n]
+
+    loop0:
+
+        movq mm0, QWORD PTR [eax]
+        movq QWORD PTR [ebx], mm0
+        add eax, 8
+        add ebx, 8
+        sub ecx, 8
+        jg loop0
+
+        emms
+    }
+    return dest;
+}
+
+#endif
+
+} // namespace libyuv
+
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
new file mode 100644
index 000000000..a538081d6
--- /dev/null
+++ b/source/cpu_id.cc
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "cpu_id.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>  // for __cpuid()
+#elif defined(LINUX) || defined(ANDROID)
+#include "linux.h"
+#endif
+
+// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
+#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
+static inline void __cpuid(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "mov %%ebx, %%edi\n"
+    "cpuid\n"
+    "xchg %%edi, %%ebx\n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type)
+  );
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static inline void __cpuid(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "cpuid\n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type)
+  );
+}
+#endif
+
+namespace libyuv {
+
+// CPU detect function for SIMD instruction sets.
+bool CpuInfo::cpu_info_initialized_ = false;
+int CpuInfo::cpu_info_ = 0;
+// Note: there is no lock; initialization is not thread safe.
+
+#ifdef CPU_X86
+void cpuid(int cpu_info[4], int info_type) {
+  __cpuid(cpu_info, info_type);
+}
+#endif
+
+void CpuInfo::InitCpuFlags() {
+#ifdef CPU_X86
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
+  cpu_info_ = (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
+              (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0);
+#elif defined(__ARM_NEON__)
+  // gcc -mfpu=neon defines __ARM_NEON__.
+  // If code is specifically built for Neon-only, enable the flag.
+  cpu_info_ |= kCpuHasNEON;
+#elif defined(LINUX) && defined(__arm__)
+  cpu_info_ = 0;
+  // Look for NEON support in /proc/cpuinfo
+  ProcCpuInfo proc_info;
+  size_t section_count;
+  if (proc_info.LoadFromSystem() &&
+      proc_info.GetSectionCount(&section_count)) {
+    for (size_t i = 0; i < section_count; ++i) {
+      std::string out_features;
+      if (proc_info.GetSectionStringValue(i, "Features", &out_features)) {
+        if (out_features.find("neon") != std::string::npos) {
+          cpu_info_ |= kCpuHasNEON;
+        }
+        break;
+      }
+    }
+  }
+#else
+  cpu_info_ = 0;
+#endif
+  cpu_info_initialized_ = true;
+}
+
+void CpuInfo::MaskCpuFlagsForTest(int enable_flags) {
+  InitCpuFlags();
+  cpu_info_ &= enable_flags;
+}
+
+bool CpuInfo::TestCpuFlag(int flag) {
+  if (!cpu_info_initialized_) {
+    InitCpuFlags();
+  }
+  return cpu_info_ & flag ? true : false;
+}
+
+// Returns the vendor string from the cpu, e.g. "GenuineIntel", "AuthenticAMD".
+// See "Intel Processor Identification and the CPUID Instruction"
+// (Intel document number: 241618)
+std::string CpuInfo::GetCpuVendor() {
+#ifdef CPU_X86
+  int cpu_info[4];
+  cpuid(cpu_info, 0);
+  // The 12-byte vendor string lives in ebx, edx, ecx; reorder it into place.
+  cpu_info[0] = cpu_info[1];  // ebx
+  cpu_info[1] = cpu_info[3];  // edx
+  cpu_info[2] = cpu_info[2];  // ecx (already in place)
+  cpu_info[3] = 0;            // NUL terminator
+  return std::string(reinterpret_cast<char*>(&cpu_info[0]));
+#else
+  return std::string("Undefined");
+#endif
+}
+
+} // namespace libyuv
diff --git a/source/cpu_id.h b/source/cpu_id.h
new file mode 100644
index 000000000..2d1d1eeb7
--- /dev/null
+++ b/source/cpu_id.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBYUV_SOURCE_CPU_ID_H_
+#define LIBYUV_SOURCE_CPU_ID_H_
+
+#include <string>  // for std::string
+
+#include "basic_types.h"
+
+namespace libyuv {
+#ifdef CPU_X86
+void cpuid(int cpu_info[4], int info_type);
+#endif
+
+class CpuInfo {
+ public:
+  // These flags are only valid on x86 processors
+  static const int kCpuHasSSE2 = 1;
+  static const int kCpuHasSSSE3 = 2;
+
+  // SIMD support on ARM processors
+  static const int kCpuHasNEON = 4;
+
+  // Detect whether the CPU has SSE2, etc.
+  static bool TestCpuFlag(int flag);
+
+  // Detect CPU vendor: "GenuineIntel" or "AuthenticAMD"
+  static std::string GetCpuVendor();
+
+  // For testing, allow CPU flags to be disabled.
+  static void MaskCpuFlagsForTest(int enable_flags);
+
+ private:
+  // Note: there is no lock; initialization is not thread safe.
+  static bool cpu_info_initialized_;
+  static int cpu_info_;
+
+  static void InitCpuFlags();
+
+  DISALLOW_IMPLICIT_CONSTRUCTORS(CpuInfo);
+};
+
+} // namespace libyuv
+
+#endif // LIBYUV_SOURCE_CPU_ID_H_
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
new file mode 100644
index 000000000..6a46681fa
--- /dev/null
+++ b/source/format_conversion.cc
@@ -0,0 +1,488 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "format_conversion.h"
+
+#include "common.h"
+#include "video_common.h"
+
+namespace libyuv {
+
+enum {
+  RED = 0,
+  BLUE = 1,
+  GREEN_BETWEEN_RED = 2,
+  GREEN_BETWEEN_BLUE = 3,
+};
+
+enum Position {
+  LEFT = 0,
+  RIGHT = 1,
+  TOP = 2,
+  BOTTOM = 4,
+  CENTER = 6,
+
+  // Due to the choice of the above values, these are all distinct and the
+  // corner values and edge values are each contiguous. This allows us to
+  // figure out the position type of a pixel with a single addition operation
+  // using the above values, rather than having to use a 3x3 nested switch
+  // statement.
+  TOP_LEFT = TOP + LEFT,          // 2
+  TOP_RIGHT = TOP + RIGHT,        // 3
+  BOTTOM_LEFT = BOTTOM + LEFT,    // 4
+  BOTTOM_RIGHT = BOTTOM + RIGHT,  // 5
+  LEFT_EDGE = CENTER + LEFT,      // 6
+  RIGHT_EDGE = CENTER + RIGHT,    // 7
+  TOP_EDGE = TOP + CENTER,        // 8
+  BOTTOM_EDGE = BOTTOM + CENTER,  // 10
+  MIDDLE = CENTER + CENTER,       // 12
+};
+
+static FORCE_INLINE Position GetPosition(int x, int y, int width, int height) {
+  Position xpos = CENTER;
+  Position ypos = CENTER;
+  if (x == 0) {
+    xpos = LEFT;
+  } else if (x == width - 1) {
+    xpos = RIGHT;
+  }
+  if (y == 0) {
+    ypos = TOP;
+  } else if (y == height - 1) {
+    ypos = BOTTOM;
+  }
+  return static_cast<Position>(xpos + ypos);
+}
+
+static FORCE_INLINE bool IsRedBlue(uint8 colour) {
+  return colour <= BLUE;
+}
+
+static FORCE_INLINE uint32 FourCcToBayerPixelColourMap(uint32 fourcc) {
+  // The colour map is a 4-byte array-as-uint32 containing the colours for the
+  // four pixels in each 2x2 grid, in left-to-right and top-to-bottom order.
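+  // For example, FOURCC_RGGB maps to the byte sequence {RED,
+  // GREEN_BETWEEN_RED, GREEN_BETWEEN_BLUE, BLUE}. Consumers fetch one colour
+  // per pixel from the low-order byte and then shift:
+  //   uint8 current_colour = static_cast<uint8>(colours);
+  //   colours >>= 8;
+  // which is exactly what BayerRGBToI420() does below.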
+ switch (fourcc) { + default: + ASSERT(false); + case FOURCC_RGGB: + return FOURCC(RED, GREEN_BETWEEN_RED, GREEN_BETWEEN_BLUE, BLUE); + case FOURCC_BGGR: + return FOURCC(BLUE, GREEN_BETWEEN_BLUE, GREEN_BETWEEN_RED, RED); + case FOURCC_GRBG: + return FOURCC(GREEN_BETWEEN_RED, RED, BLUE, GREEN_BETWEEN_BLUE); + case FOURCC_GBRG: + return FOURCC(GREEN_BETWEEN_BLUE, BLUE, RED, GREEN_BETWEEN_RED); + } +} + +static FORCE_INLINE void RGBToYUV(uint8 r, uint8 g, uint8 b, + uint8* y, uint8* u, uint8* v) { + // Taken from http://en.wikipedia.org/wiki/YUV + *y = (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; + *u = ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; + *v = ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; +} + +static FORCE_INLINE void InterpolateBayerRGBCorner(uint8* r, + uint8* g, + uint8* b, + const uint8* src, + int src_pitch, + Position pos, + uint8 colour) { + + // Compute the offsets to use for fetching the adjacent pixels. + + int adjacent_row; + int adjacent_column; + switch (pos) { + case TOP_LEFT: + adjacent_row = src_pitch; + adjacent_column = 1; + break; + case TOP_RIGHT: + adjacent_row = src_pitch; + adjacent_column = -1; + break; + case BOTTOM_LEFT: + adjacent_row = -src_pitch; + adjacent_column = 1; + break; + case BOTTOM_RIGHT: + default: + adjacent_row = -src_pitch; + adjacent_column = -1; + break; + } + + // Now interpolate. + + if (IsRedBlue(colour)) { + uint8 current_pixel = src[0]; + // Average of the adjacent green pixels (there's only two). + *g = (src[adjacent_column] + src[adjacent_row]) / 2; + // Average of the oppositely-coloured corner pixels (there's only one). + uint8 corner_average = src[adjacent_row + adjacent_column]; + if (colour == RED) { + *r = current_pixel; + *b = corner_average; + } else { // i.e., BLUE + *b = current_pixel; + *r = corner_average; + } + } else { // i.e., GREEN_BETWEEN_* + *g = src[0]; + // Average of the adjacent same-row pixels (there's only one). + uint8 row_average = src[adjacent_column]; + // Average of the adjacent same-column pixels (there's only one). + uint8 column_average = src[adjacent_row]; + if (colour == GREEN_BETWEEN_RED) { + *r = row_average; + *b = column_average; + } else { // i.e., GREEN_BETWEEN_BLUE + *b = row_average; + *r = column_average; + } + } +} + +static FORCE_INLINE void InterpolateBayerRGBEdge(uint8* r, + uint8* g, + uint8* b, + const uint8* src, + int src_pitch, + Position pos, + uint8 colour) { + + // Compute the offsets to use for fetching the adjacent pixels. + + // Goes one pixel "in" to the image (i.e. towards the center) + int inner; + // Goes one pixel to the side (i.e. along the edge) in either the clockwise or + // counter-clockwise direction, and its negative value goes in the other + // direction. + int side; + + switch (pos) { + case TOP_EDGE: + inner = src_pitch; + side = 1; + break; + case RIGHT_EDGE: + inner = -1; + side = src_pitch; + break; + case BOTTOM_EDGE: + inner = -src_pitch; + side = 1; + break; + case LEFT_EDGE: + default: + inner = 1; + side = src_pitch; + break; + } + + // Now interpolate. + + if (IsRedBlue(colour)) { + uint8 current_pixel = src[0]; + // Average of the adjacent green pixels (there's only three). + *g = (src[inner] + src[side] + src[-side]) / 3; + // Average of the oppositely-coloured corner pixels (there's only two). 
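+    // For instance, for a red/blue pixel P on the top edge (inner = one row
+    // down, side = +/-1 along the edge), the neighbourhood sampled is:
+    //     G  P  G    <- edge row
+    //     C  G  C    <- inner row
+    // where the three G pixels feed *g and the two C corner pixels are
+    // averaged below.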
+ uint8 corner_average = (src[inner + side] + src[inner - side]) / 2; + if (colour == RED) { + *r = current_pixel; + *b = corner_average; + } else { // i.e., BLUE + *b = current_pixel; + *r = corner_average; + } + } else { // i.e., GREEN_BETWEEN_* + *g = src[0]; + // Average of the adjacent side-ways pixels (there's only two). + uint8 side_average = (src[side] + src[-side]) / 2; + // Average of the adjacent inner-ways pixels (there's only one). + uint8 inner_pixel = src[inner]; + // Including && side == 1 effectively transposes the colour logic for + // processing the left/right sides, which is needed since the "T" shape + // formed by the pixels is transposed. + if (colour == GREEN_BETWEEN_RED && side == 1) { + *r = side_average; + *b = inner_pixel; + } else { // i.e., GREEN_BETWEEN_BLUE || side != 1 + *b = side_average; + *r = inner_pixel; + } + } +} + +// We inline this one because it runs 99% of the time, so inlining it is +// probably beneficial. +static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r, + uint8* g, + uint8* b, + const uint8* src, + int src_pitch, + uint8 colour) { + + if (IsRedBlue(colour)) { + uint8 current_pixel = src[0]; + // Average of the adjacent green pixels (there's four). + // NOTE(tschmelcher): The material at + // http://www.siliconimaging.com/RGB%20Bayer.htm discusses a way to improve + // quality here by using only two of the green pixels based on the + // correlation to the nearby red/blue pixels, but that is slower and would + // result in more edge cases. + *g = (src[1] + src[-1] + src[src_pitch] + src[-src_pitch]) / 4; + // Average of the oppositely-coloured corner pixels (there's four). + uint8 corner_average = (src[src_pitch + 1] + + src[src_pitch - 1] + + src[-src_pitch + 1] + + src[-src_pitch - 1]) / 4; + if (colour == RED) { + *r = current_pixel; + *b = corner_average; + } else { // i.e., BLUE + *b = current_pixel; + *r = corner_average; + } + } else { // i.e., GREEN_BETWEEN_* + *g = src[0]; + // Average of the adjacent same-row pixels (there's two). + uint8 row_adjacent = (src[1] + src[-1]) / 2; + // Average of the adjacent same-column pixels (there's two). + uint8 column_adjacent = (src[src_pitch] + src[-src_pitch]) / 2; + if (colour == GREEN_BETWEEN_RED) { + *r = row_adjacent; + *b = column_adjacent; + } else { // i.e., GREEN_BETWEEN_BLUE + *b = row_adjacent; + *r = column_adjacent; + } + } +} + +// Converts any Bayer RGB format to I420. +void BayerRGBToI420(const uint8* src, int src_pitch, uint32 src_fourcc, + uint8* y, int y_pitch, + uint8* u, int u_pitch, + uint8* v, int v_pitch, + int width, int height) { + ASSERT(width % 2 == 0); + ASSERT(height % 2 == 0); + + uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc); + + int src_row_inc = src_pitch * 2 - width; + int y_row_inc = y_pitch * 2 - width; + int u_row_inc = u_pitch - width / 2; + int v_row_inc = v_pitch - width / 2; + + // Iterate over the 2x2 grids. + for (int y1 = 0; y1 < height; y1 += 2) { + for (int x1 = 0; x1 < width; x1 += 2) { + uint32 colours = colour_map; + int total_u = 0; + int total_v = 0; + // Iterate over the four pixels within them. + for (int y2 = 0; y2 < 2; ++y2) { + for (int x2 = 0; x2 < 2; ++x2) { + uint8 r, g, b; + // The low-order byte of the colour map is the current colour. + uint8 current_colour = static_cast(colours); + colours >>= 8; + Position pos = GetPosition(x1 + x2, y1 + y2, width, height); + const uint8* src_pixel = &src[y2 * src_pitch + x2]; + uint8* y_pixel = &y[y2 * y_pitch + x2]; + + // Convert from Bayer RGB to regular RGB. 
+ + if (pos == MIDDLE) { + // 99% of the image is the middle. + InterpolateBayerRGBCenter(&r, &g, &b, + src_pixel, src_pitch, + current_colour); + } else if (pos >= LEFT_EDGE) { + // Next most frequent is edges. + InterpolateBayerRGBEdge(&r, &g, &b, + src_pixel, src_pitch, pos, + current_colour); + } else { + // Last is the corners. There are only 4. + InterpolateBayerRGBCorner(&r, &g, &b, + src_pixel, src_pitch, pos, + current_colour); + } + + // Convert from RGB to YUV. + + uint8 tmp_u, tmp_v; + RGBToYUV(r, g, b, y_pixel, &tmp_u, &tmp_v); + total_u += tmp_u; + total_v += tmp_v; + } + } + src += 2; + y += 2; + *u = total_u / 4; + *v = total_v / 4; + ++u; + ++v; + } + src += src_row_inc; + y += y_row_inc; + u += u_row_inc; + v += v_row_inc; + } +} + +// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers +// and vst would select which 2 components to write. The low level would need +// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_ARGBTOBAYERROW_SSSE3 +__declspec(naked) +static void ARGBToBayerRow_SSSE3(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_bayer + movd xmm0, [esp + 12] // selector + mov ecx, [esp + 16] // pix + pshufd xmm0, xmm0, 0 + + wloop: + movdqa xmm1, [eax] + lea eax, [eax + 16] + pshufb xmm1, xmm0 + movd [edx], xmm1 + lea edx, [edx + 4] + sub ecx, 4 + ja wloop + ret + } +} + +#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ + !TARGET_IPHONE_SIMULATOR + +#define HAS_ARGBTOBAYERROW_SSSE3 +extern "C" void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); + asm( + ".text\n" +#if defined(OSX) + ".globl _ARGBToBayerRow_SSSE3\n" +"_ARGBToBayerRow_SSSE3:\n" +#else + ".global ARGBToBayerRow_SSSE3\n" +"ARGBToBayerRow_SSSE3:\n" +#endif + "mov 0x4(%esp),%eax\n" + "mov 0x8(%esp),%edx\n" + "movd 0xc(%esp),%xmm0\n" + "mov 0x10(%esp),%ecx\n" + "pshufd $0x0,%xmm0,%xmm0\n" + +"1:" + "movdqa (%eax),%xmm1\n" + "lea 0x10(%eax),%eax\n" + "pshufb %xmm0,%xmm1\n" + "movd %xmm1,(%edx)\n" + "lea 0x4(%edx),%edx\n" + "sub $0x4,%ecx\n" + "ja 1b\n" + "ret\n" +); +#endif + +static void ARGBToBayerRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + int index0 = selector & 0xff; + int index1 = (selector >> 8) & 0xff; + // Copy a row of Bayer. + for (int x = 0; x < pix; x += 2) { + dst_bayer[0] = src_argb[index0]; + dst_bayer[1] = src_argb[index1]; + src_argb += 8; + dst_bayer += 2; + } +} + +// generate a selector mask useful for pshufb +static uint32 GenerateSelector(int select0, int select1) { + return static_cast(select0) | + static_cast((select1 + 4) << 8) | + static_cast((select0 + 8) << 16) | + static_cast((select1 + 12) << 24); +} + +// Converts any 32 bit ARGB to any Bayer RGB format. 
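+// A worked example of the selector scheme above: for an RGGB target,
+// GenerateSelector(2, 1) builds the selector {2, 5, 10, 13}, which picks the
+// R byte of pixel 0, the G byte of pixel 1, the R byte of pixel 2 and the G
+// byte of pixel 3 out of each 16-byte ARGB group - i.e. one even Bayer row.
+// Note that ARGBToBayerRow_C reads only the two low-order indices, so both
+// paths assume the Bayer pattern repeats every two pixels.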
+void RGB32ToBayerRGB(const uint8* src_rgb, int src_pitch_rgb, + uint32 src_fourcc_rgb, + uint8* dst_bayer, int dst_pitch_bayer, + uint32 dst_fourcc_bayer, + int width, int height) { + ASSERT(width % 2 == 0); + void (*ARGBToBayerRow)(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix); +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (CpuInfo::TestCpuFlag(CpuInfo::kCpuHasSSSE3) && + (width % 4 == 0) && + IS_ALIGNED(src_rgb, 16) && (src_pitch_rgb % 16 == 0) && + IS_ALIGNED(dst_bayer, 4) && (dst_pitch_bayer % 4 == 0)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } else +#endif + { + ARGBToBayerRow = ARGBToBayerRow_C; + } + + ASSERT(src_fourcc_rgb == FOURCC_ARGB); + int blue_index = 0; + int green_index = 1; + int red_index = 2; + + // Now build a lookup table containing the indices for the four pixels in each + // 2x2 Bayer grid. + uint32 index_map[2]; + switch (dst_fourcc_bayer) { + default: + ASSERT(false); + case FOURCC_RGGB: + index_map[0] = GenerateSelector(red_index, green_index); + index_map[1] = GenerateSelector(green_index, blue_index); + break; + case FOURCC_BGGR: + index_map[0] = GenerateSelector(blue_index, green_index); + index_map[1] = GenerateSelector(green_index, red_index); + break; + case FOURCC_GRBG: + index_map[0] = GenerateSelector(green_index, red_index); + index_map[1] = GenerateSelector(blue_index, green_index); + break; + case FOURCC_GBRG: + index_map[0] = GenerateSelector(green_index, blue_index); + index_map[1] = GenerateSelector(red_index, green_index); + break; + } + + // Now convert. + for (int y = 0; y < height; ++y) { + ARGBToBayerRow(src_rgb, dst_bayer, index_map[y & 1], width); + src_rgb += src_pitch_rgb; + dst_bayer += dst_pitch_bayer; + } +} + +} // namespace libyuv diff --git a/source/general.cc b/source/general.cc new file mode 100644 index 000000000..13b628be0 --- /dev/null +++ b/source/general.cc @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "general.h" + +#include +#include // memcpy(), memset() + +#include "video_common.h" + +namespace libyuv { + + +int +MirrorI420LeftRight( const uint8* src_frame,uint8* dst_frame, + int src_width, int src_height) +{ + if (src_width < 1 || src_height < 1) + { + return -1; + } + + assert(src_width % 2 == 0 && src_height % 2 == 0); + + int indO = 0; + int indS = 0; + int wind, hind; + uint8 tmpVal; + // Will swap two values per iteration + const int halfW = src_width >> 1; + const int halfStride = src_width >> 1; + // Y + for (wind = 0; wind < halfW; wind++ ) + { + for (hind = 0; hind < src_height; hind++ ) + { + indO = hind * src_width + wind; + indS = hind * src_width + (src_width - wind - 1); // swapping index + tmpVal = src_frame[indO]; + dst_frame[indO] = src_frame[indS]; + dst_frame[indS] = tmpVal; + } // end for (height) + } // end for(width) + const int lengthW = src_width >> 2; + const int lengthH = src_height >> 1; + // V + int zeroInd = src_width * src_height; + for (wind = 0; wind < lengthW; wind++ ) + { + for (hind = 0; hind < lengthH; hind++ ) + { + indO = zeroInd + hind * halfW + wind; + indS = zeroInd + hind * halfW + (halfW - wind - 1);// swapping index + tmpVal = src_frame[indO]; + dst_frame[indO] = src_frame[indS]; + dst_frame[indS] = tmpVal; + } // end for (height) + } // end for(width) + + // U + zeroInd += src_width * src_height >> 2; + for (wind = 0; wind < lengthW; wind++ ) + { + for (hind = 0; hind < lengthH; hind++ ) + { + indO = zeroInd + hind * halfW + wind; + indS = zeroInd + hind * halfW + (halfW - wind - 1);// swapping index + tmpVal = src_frame[indO]; + dst_frame[indO] = src_frame[indS]; + dst_frame[indS] = tmpVal; + } // end for (height) + } // end for(width) + + return 0; +} + + +// Make a center cut +int +CutI420Frame(uint8* frame, + int fromWidth, int fromHeight, + int toWidth, int toHeight) +{ + if (toWidth < 1 || fromWidth < 1 || toHeight < 1 || fromHeight < 1 ) + { + return -1; + } + if (toWidth == fromWidth && toHeight == fromHeight) + { + // Nothing to do + return 3 * toHeight * toWidth / 2; + } + if (toWidth > fromWidth || toHeight > fromHeight) + { + // error + return -1; + } + int i = 0; + int m = 0; + int loop = 0; + int halfToWidth = toWidth / 2; + int halfToHeight = toHeight / 2; + int halfFromWidth = fromWidth / 2; + int halfFromHeight= fromHeight / 2; + int cutHeight = ( fromHeight - toHeight ) / 2; + int cutWidth = ( fromWidth - toWidth ) / 2; + + for (i = fromWidth * cutHeight + cutWidth; loop < toHeight ; + loop++, i += fromWidth) + { + memcpy(&frame[m],&frame[i],toWidth); + m += toWidth; + } + i = fromWidth * fromHeight; // ilum + loop = 0; + for ( i += (halfFromWidth * cutHeight / 2 + cutWidth / 2); + loop < halfToHeight; loop++,i += halfFromWidth) + { + memcpy(&frame[m],&frame[i],halfToWidth); + m += halfToWidth; + } + loop = 0; + i = fromWidth * fromHeight + halfFromHeight * halfFromWidth; // ilum + Cr + for ( i += (halfFromWidth * cutHeight / 2 + cutWidth / 2); + loop < halfToHeight; loop++, i += halfFromWidth) + { + memcpy(&frame[m],&frame[i],halfToWidth); + m += halfToWidth; + } + return halfToWidth * toHeight * 3; +} + +int +CutPadI420Frame(const uint8* inFrame, int inWidth, + int inHeight, uint8* outFrame, + int outWidth, int outHeight) +{ + if (inWidth < 1 || outWidth < 1 || inHeight < 1 || outHeight < 1 ) + { + return -1; + } + if (inWidth == outWidth && inHeight == outHeight) + { + memcpy(outFrame, inFrame, 3 * outWidth * (outHeight >> 1)); + } + else + { + if ( inHeight < outHeight) + { + // pad height + int padH 
= outHeight - inHeight; + int i = 0; + int padW = 0; + int cutW = 0; + int width = inWidth; + if (inWidth < outWidth) + { + // pad width + padW = outWidth - inWidth; + } + else + { + // cut width + cutW = inWidth - outWidth; + width = outWidth; + } + if (padH) + { + memset(outFrame, 0, outWidth * (padH >> 1)); + outFrame += outWidth * (padH >> 1); + } + for (i = 0; i < inHeight;i++) + { + if (padW) + { + memset(outFrame, 0, padW / 2); + outFrame += padW / 2; + } + inFrame += cutW >> 1; // in case we have a cut + memcpy(outFrame,inFrame ,width); + inFrame += cutW >> 1; + outFrame += width; + inFrame += width; + if (padW) + { + memset(outFrame, 0, padW / 2); + outFrame += padW / 2; + } + } + if (padH) + { + memset(outFrame, 0, outWidth * (padH >> 1)); + outFrame += outWidth * (padH >> 1); + } + if (padH) + { + memset(outFrame, 127, (outWidth >> 2) * (padH >> 1)); + outFrame += (outWidth >> 2) * (padH >> 1); + } + for (i = 0; i < (inHeight >> 1); i++) + { + if (padW) + { + memset(outFrame, 127, padW >> 2); + outFrame += padW >> 2; + } + inFrame += cutW >> 2; // in case we have a cut + memcpy(outFrame, inFrame,width >> 1); + inFrame += cutW >> 2; + outFrame += width >> 1; + inFrame += width >> 1; + if (padW) + { + memset(outFrame, 127, padW >> 2); + outFrame += padW >> 2; + } + } + if (padH) + { + memset(outFrame, 127, (outWidth >> 1) * (padH >> 1)); + outFrame += (outWidth >> 1) * (padH >> 1); + } + for (i = 0; i < (inHeight >> 1); i++) + { + if (padW) + { + memset(outFrame, 127, padW >> 2); + outFrame += padW >> 2; + } + inFrame += cutW >> 2; // in case we have a cut + memcpy(outFrame, inFrame,width >> 1); + inFrame += cutW >> 2; + outFrame += width >> 1; + inFrame += width >> 1; + if (padW) + { + memset(outFrame, 127, padW >> 2); + outFrame += padW >> 2; + } + } + if (padH) + { + memset(outFrame, 127, (outWidth >> 2) * (padH >> 1)); + outFrame += (outWidth >> 2) * (padH >> 1); + } + } + else + { + // cut height + int i = 0; + int padW = 0; + int cutW = 0; + int width = inWidth; + + if (inWidth < outWidth) + { + // pad width + padW = outWidth - inWidth; + } else + { + // cut width + cutW = inWidth - outWidth; + width = outWidth; + } + int diffH = inHeight - outHeight; + inFrame += inWidth * (diffH >> 1); // skip top I + + for (i = 0; i < outHeight; i++) + { + if (padW) + { + memset(outFrame, 0, padW / 2); + outFrame += padW / 2; + } + inFrame += cutW >> 1; // in case we have a cut + memcpy(outFrame,inFrame ,width); + inFrame += cutW >> 1; + outFrame += width; + inFrame += width; + if (padW) + { + memset(outFrame, 0, padW / 2); + outFrame += padW / 2; + } + } + inFrame += inWidth * (diffH >> 1); // skip end I + inFrame += (inWidth >> 2) * (diffH >> 1); // skip top of Cr + for (i = 0; i < (outHeight >> 1); i++) + { + if (padW) + { + memset(outFrame, 127, padW >> 2); + outFrame += padW >> 2; + } + inFrame += cutW >> 2; // in case we have a cut + memcpy(outFrame, inFrame,width >> 1); + inFrame += cutW >> 2; + outFrame += width >> 1; + inFrame += width >> 1; + if (padW) + { + memset(outFrame, 127, padW >> 2); + outFrame += padW >> 2; + } + } + inFrame += (inWidth >> 2) * (diffH >> 1); // skip end of Cr + inFrame += (inWidth >> 2) * (diffH >> 1); // skip top of Cb + for (i = 0; i < (outHeight >> 1); i++) + { + if (padW) + { + memset(outFrame, 127, padW >> 2); + outFrame += padW >> 2; + } + inFrame += cutW >> 2; // in case we have a cut + memcpy(outFrame, inFrame, width >> 1); + inFrame += cutW >> 2; + outFrame += width >> 1; + inFrame += width >> 1; + if (padW) + { + memset(outFrame, 127, padW >> 
2);
+                    outFrame += padW >> 2;
+                }
+            }
+        }
+    }
+    return 3 * outWidth * (outHeight >> 1);
+}
+
+} // namespace libyuv
diff --git a/source/linux.cc b/source/linux.cc
new file mode 100644
index 000000000..1c3aa9c14
--- /dev/null
+++ b/source/linux.cc
@@ -0,0 +1,350 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if defined(LINUX) || defined(ANDROID)
+#include "linux.h"
+
+#include <ctype.h>        // for isspace()
+#include <sys/utsname.h>  // for uname()
+
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace libyuv {
+
+static const char kCpuInfoFile[] = "/proc/cpuinfo";
+static const char kCpuMaxFreqFile[] =
+    "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq";
+
+ProcCpuInfo::ProcCpuInfo() {
+}
+
+ProcCpuInfo::~ProcCpuInfo() {
+}
+
+bool ProcCpuInfo::LoadFromSystem() {
+  ConfigParser procfs;
+  if (!procfs.Open(kCpuInfoFile)) {
+    return false;
+  }
+  return procfs.Parse(&sections_);
+}
+
+bool ProcCpuInfo::GetSectionCount(size_t* count) {
+  if (sections_.empty()) {
+    return false;
+  }
+  if (count) {
+    *count = sections_.size();
+  }
+  return true;
+}
+
+bool ProcCpuInfo::GetNumCpus(int* num) {
+  if (sections_.empty()) {
+    return false;
+  }
+  int total_cpus = 0;
+#if defined(__arm__)
+  // Count the number of blocks that have a "processor" key defined. On ARM,
+  // there may be extra blocks of information that aren't per-processor.
+  size_t section_count = sections_.size();
+  for (size_t i = 0; i < section_count; ++i) {
+    int processor_id;
+    if (GetSectionIntValue(i, "processor", &processor_id)) {
+      ++total_cpus;
+    }
+  }
+  // Single core ARM systems don't include "processor" keys at all, so return
+  // that we have a single core if we didn't find any explicitly above.
+  if (total_cpus == 0) {
+    total_cpus = 1;
+  }
+#else
+  // On X86, there is exactly one info section per processor.
+  total_cpus = static_cast<int>(sections_.size());
+#endif
+  if (num) {
+    *num = total_cpus;
+  }
+  return true;
+}
+
+bool ProcCpuInfo::GetNumPhysicalCpus(int* num) {
+  if (sections_.empty()) {
+    return false;
+  }
+  // TODO(noahric): /proc/cpuinfo only reports cores that are currently
+  // _online_, so this may underreport the number of physical cores.
+#if defined(__arm__)
+  // ARM (currently) has no hyperthreading, so just return the same value
+  // as GetNumCpus.
+  return GetNumCpus(num);
+#else
+  int total_cores = 0;
+  std::set<int> physical_ids;
+  size_t section_count = sections_.size();
+  for (size_t i = 0; i < section_count; ++i) {
+    int physical_id;
+    int cores;
+    // Count the cores for the physical id only if we have not counted the id.
+    if (GetSectionIntValue(i, "physical id", &physical_id) &&
+        GetSectionIntValue(i, "cpu cores", &cores) &&
+        physical_ids.find(physical_id) == physical_ids.end()) {
+      physical_ids.insert(physical_id);
+      total_cores += cores;
+    }
+  }
+
+  if (num) {
+    *num = total_cores;
+  }
+  return true;
+#endif
+}
+
+bool ProcCpuInfo::GetCpuFamily(int* id) {
+  int cpu_family = 0;
+
+#if defined(__arm__)
+  // On ChromeOS seaboard, there is no 'cpu family' in '/proc/cpuinfo'. But
+  // there is 'CPU architecture' which can be used as 'cpu family'.
+  // See http://en.wikipedia.org/wiki/ARM_architecture for a good list of
+  // ARM cpu families, architectures, and their mappings.
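+  // For example, an ARMv7 device reports a /proc/cpuinfo line such as
+  //   CPU architecture: 7
+  // which this function surfaces as cpu family 7.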
+  // There may be multiple sections that aren't per-processor. We need to
+  // scan through each section until we find the first 'CPU architecture'.
+  size_t section_count = sections_.size();
+  for (size_t i = 0; i < section_count; ++i) {
+    if (GetSectionIntValue(i, "CPU architecture", &cpu_family)) {
+      // We return the first one (if there are multiple entries).
+      break;
+    }
+  }
+#else
+  GetSectionIntValue(0, "cpu family", &cpu_family);
+#endif
+
+  if (id) {
+    *id = cpu_family;
+  }
+  return true;
+}
+
+bool ProcCpuInfo::GetSectionStringValue(size_t section_num,
+                                        const std::string& key,
+                                        std::string* result) {
+  if (section_num >= sections_.size()) {
+    return false;
+  }
+  ConfigParser::SimpleMap::iterator iter = sections_[section_num].find(key);
+  if (iter == sections_[section_num].end()) {
+    return false;
+  }
+  *result = iter->second;
+  return true;
+}
+
+bool ProcCpuInfo::GetSectionIntValue(size_t section_num,
+                                     const std::string& key,
+                                     int* result) {
+  if (section_num >= sections_.size()) {
+    return false;
+  }
+  ConfigParser::SimpleMap::iterator iter = sections_[section_num].find(key);
+  if (iter == sections_[section_num].end()) {
+    return false;
+  }
+  return FromString(iter->second, result);
+}
+
+ConfigParser::ConfigParser() {}
+
+ConfigParser::~ConfigParser() {}
+
+bool ConfigParser::Open(const std::string& filename) {
+  FileStream* fs = new FileStream();
+  if (!fs->Open(filename, "r", NULL)) {
+    return false;
+  }
+  instream_.reset(fs);
+  return true;
+}
+
+void ConfigParser::Attach(StreamInterface* stream) {
+  instream_.reset(stream);
+}
+
+bool ConfigParser::Parse(MapVector* key_val_pairs) {
+  // Parses the file and places the found key-value pairs into key_val_pairs.
+  SimpleMap section;
+  while (ParseSection(&section)) {
+    key_val_pairs->push_back(section);
+    section.clear();
+  }
+  return (!key_val_pairs->empty());
+}
+
+bool ConfigParser::ParseSection(SimpleMap* key_val_pair) {
+  // Parses the next section in the filestream and places the found key-value
+  // pairs into key_val_pair.
+  std::string key, value;
+  while (ParseLine(&key, &value)) {
+    (*key_val_pair)[key] = value;
+  }
+  return (!key_val_pair->empty());
+}
+
+bool ConfigParser::ParseLine(std::string* key, std::string* value) {
+  // Parses the next line in the filestream and places the found key-value
+  // pair into key and val.
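+  // For example, the /proc/cpuinfo line
+  //   model name  : ARMv7 Processor rev 3 (v7l)
+  // splits on ':' into key "model name" and value
+  // "ARMv7 Processor rev 3 (v7l)" after the trimming below.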
+ std::string line; + if ((instream_->ReadLine(&line)) == EOF) { + return false; + } + std::vector tokens; + if (2 != split(line, ':', &tokens)) { + return false; + } + // Removes whitespace at the end of Key name + size_t pos = tokens[0].length() - 1; + while ((pos > 0) && isspace(tokens[0][pos])) { + pos--; + } + tokens[0].erase(pos + 1); + // Removes whitespace at the start of value + pos = 0; + while (pos < tokens[1].length() && isspace(tokens[1][pos])) { + pos++; + } + tokens[1].erase(0, pos); + *key = tokens[0]; + *value = tokens[1]; + return true; +} + +static bool ExpectLineFromStream(FileStream* stream, + std::string* out) { + StreamResult res = stream->ReadLine(out); + if (res != SR_SUCCESS) { + if (res != SR_EOS) { + LOG(LS_ERROR) << "Error when reading from stream"; + } else { + LOG(LS_ERROR) << "Incorrect number of lines in stream"; + } + return false; + } + return true; +} + +static void ExpectEofFromStream(FileStream* stream) { + std::string unused; + StreamResult res = stream->ReadLine(&unused); + if (res == SR_SUCCESS) { + LOG(LS_WARNING) << "Ignoring unexpected extra lines from stream"; + } else if (res != SR_EOS) { + LOG(LS_WARNING) << "Error when checking for extra lines from stream"; + } +} + +// For caching the lsb_release output (reading it invokes a sub-process and +// hence is somewhat expensive). +static std::string lsb_release_string; +static CriticalSection lsb_release_string_critsec; + +std::string ReadLinuxLsbRelease() { + CritScope cs(&lsb_release_string_critsec); + if (!lsb_release_string.empty()) { + // Have cached result from previous call. + return lsb_release_string; + } + // No cached result. Run lsb_release and parse output. + POpenStream lsb_release_output; + if (!lsb_release_output.Open("lsb_release -idrcs", "r", NULL)) { + LOG_ERR(LS_ERROR) << "Can't run lsb_release"; + return lsb_release_string; // empty + } + // Read in the command's output and build the string. + std::ostringstream sstr; + std::string line; + int wait_status; + + if (!ExpectLineFromStream(&lsb_release_output, &line)) { + return lsb_release_string; // empty + } + sstr << "DISTRIB_ID=" << line; + + if (!ExpectLineFromStream(&lsb_release_output, &line)) { + return lsb_release_string; // empty + } + sstr << " DISTRIB_DESCRIPTION=\"" << line << '"'; + + if (!ExpectLineFromStream(&lsb_release_output, &line)) { + return lsb_release_string; // empty + } + sstr << " DISTRIB_RELEASE=" << line; + + if (!ExpectLineFromStream(&lsb_release_output, &line)) { + return lsb_release_string; // empty + } + sstr << " DISTRIB_CODENAME=" << line; + + // Should not be anything left. 
+
+  ExpectEofFromStream(&lsb_release_output);
+
+  lsb_release_output.Close();
+  wait_status = lsb_release_output.GetWaitStatus();
+  if (wait_status == -1 ||
+      !WIFEXITED(wait_status) ||
+      WEXITSTATUS(wait_status) != 0) {
+    LOG(LS_WARNING) << "Unexpected exit status from lsb_release";
+  }
+
+  lsb_release_string = sstr.str();
+
+  return lsb_release_string;
+}
+
+std::string ReadLinuxUname() {
+  struct utsname buf;
+  if (uname(&buf) < 0) {
+    LOG_ERR(LS_ERROR) << "Can't call uname()";
+    return std::string();
+  }
+  std::ostringstream sstr;
+  sstr << buf.sysname << " "
+       << buf.release << " "
+       << buf.version << " "
+       << buf.machine;
+  return sstr.str();
+}
+
+int ReadCpuMaxFreq() {
+  FileStream fs;
+  std::string str;
+  int freq = -1;
+  if (!fs.Open(kCpuMaxFreqFile, "r", NULL) ||
+      SR_SUCCESS != fs.ReadLine(&str) ||
+      !FromString(str, &freq)) {
+    return -1;
+  }
+  return freq;
+}
+
+} // namespace libyuv
+
+#endif // defined(LINUX) || defined(ANDROID)
diff --git a/source/linux.h b/source/linux.h
new file mode 100644
index 000000000..3d5d0c8f4
--- /dev/null
+++ b/source/linux.h
@@ -0,0 +1,118 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBYUV_SOURCE_LINUX_H_
+#define LIBYUV_SOURCE_LINUX_H_
+
+#if defined(LINUX) || defined(ANDROID)
+#include <map>     // for std::map
+#include <string>  // for std::string
+#include <vector>  // for std::vector
+
+namespace libyuv {
+
+//////////////////////////////////////////////////////////////////////////////
+// ConfigParser parses a FileStream of an ".ini"-type format into a map.
+//////////////////////////////////////////////////////////////////////////////
+
+// Sample Usage:
+//   ConfigParser parser;
+//   ConfigParser::MapVector key_val_pairs;
+//   if (parser.Open(inifile) && parser.Parse(&key_val_pairs)) {
+//     for (size_t section_num = 0; section_num < key_val_pairs.size();
+//          ++section_num) {
+//       std::string val1 = key_val_pairs[section_num][key1];
+//       std::string val2 = key_val_pairs[section_num][key2];
+//       // Do something with valn;
+//     }
+//   }
+
+class ConfigParser {
+ public:
+  typedef std::map<std::string, std::string> SimpleMap;
+  typedef std::vector<SimpleMap> MapVector;
+
+  ConfigParser();
+  virtual ~ConfigParser();
+
+  virtual bool Open(const std::string& filename);
+  virtual void Attach(StreamInterface* stream);
+  virtual bool Parse(MapVector* key_val_pairs);
+  virtual bool ParseSection(SimpleMap* key_val_pair);
+  virtual bool ParseLine(std::string* key, std::string* value);
+
+ private:
+  scoped_ptr<StreamInterface> instream_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// ProcCpuInfo reads CPU info from the /proc subsystem on any *NIX platform.
+//////////////////////////////////////////////////////////////////////////////
+
+// Sample Usage:
+//   ProcCpuInfo proc_info;
+//   int no_of_cpu;
+//   if (proc_info.LoadFromSystem()) {
+//     std::string out_str;
+//     proc_info.GetNumCpus(&no_of_cpu);
+//     proc_info.GetSectionStringValue(0, "vendor_id", &out_str);
+//   }
+
+class ProcCpuInfo {
+ public:
+  ProcCpuInfo();
+  virtual ~ProcCpuInfo();
+
+  // Reads the proc subsystem's cpu info into memory. If this fails, this
+  // returns false; if it succeeds, it returns true.
+  virtual bool LoadFromSystem();
+
+  // Obtains the number of logical CPU threads and places the value in num.
+ virtual bool GetNumCpus(int* num); + + // Obtains the number of physical CPU cores and places the value num. + virtual bool GetNumPhysicalCpus(int* num); + + // Obtains the CPU family id. + virtual bool GetCpuFamily(int* id); + + // Obtains the number of sections in /proc/cpuinfo, which may be greater + // than the number of CPUs (e.g. on ARM) + virtual bool GetSectionCount(size_t* count); + + // Looks for the CPU proc item with the given name for the given section + // number and places the string value in result. + virtual bool GetSectionStringValue(size_t section_num, const std::string& key, + std::string* result); + + // Looks for the CPU proc item with the given name for the given section + // number and places the int value in result. + virtual bool GetSectionIntValue(size_t section_num, const std::string& key, + int* result); + + private: + ConfigParser::MapVector sections_; +}; + +// Builds a string containing the info from lsb_release on a single line. +std::string ReadLinuxLsbRelease(); + +// Returns the output of "uname". +std::string ReadLinuxUname(); + +// Returns the content (int) of +// /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq +// Returns -1 on error. +int ReadCpuMaxFreq(); + +} // namespace libyuv + +#endif // defined(LINUX) || defined(ANDROID) +#endif // LIBYUV_SOURCE_LINUX_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc new file mode 100644 index 000000000..452543d07 --- /dev/null +++ b/source/planar_functions.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "planar_functions.h" + +#include + +#include "cpu_id.h" + +namespace libyuv { + +#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) +#define HAS_SPLITUV_NEON +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v +// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels. +static void SplitUV_NEON(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix) { + __asm__ volatile + ( + "1:\n" + "vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV + "vst1.u8 {q0}, [%1]! \n" // store U + "vst1.u8 {q1}, [%2]! 
\n" // Store V + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" + : // Output registers + : "r"(src_uv), "r"(dst_u), "r"(dst_v), "r"(pix) // Input registers + : "q0", "q1" // Clobber List + ); +} + +#elif defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_SPLITUV_SSE2 +static void SplitUV_SSE2(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + mov esi, src_uv + mov edi, dst_u + mov edx, dst_v + mov ecx, pix + mov eax, 0x00ff00ff // mask for isolating low bytes + movd xmm7, eax + pshufd xmm7, xmm7, 0 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm7 // even bytes + pand xmm1, xmm7 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqa [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + } +} + +#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ + !TARGET_IPHONE_SIMULATOR + +// GCC version is same as Visual C + +#define HAS_SPLITUV_SSE2 +extern "C" void SplitUV_SSE2(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix); + + asm( + ".text\n" +#if defined(OSX) + ".globl _SplitUV_SSE2\n" +"_SplitUV_SSE2:\n" +#else + ".global SplitUV_SSE2\n" +"SplitUV_SSE2:\n" +#endif + "push %ebp\n" + "mov %esp,%ebp\n" + "push %esi\n" + "push %edi\n" + "mov 0x8(%ebp),%esi\n" + "mov 0xc(%ebp),%edi\n" + "mov 0x10(%ebp),%edx\n" + "mov 0x14(%ebp),%ecx\n" + "mov $0xff00ff,%eax\n" + "movd %eax,%xmm7\n" + "pshufd $0x0,%xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm1\n" + "lea 0x20(%esi),%esi\n" + "movdqa %xmm0,%xmm2\n" + "movdqa %xmm1,%xmm3\n" + "pand %xmm7,%xmm0\n" + "pand %xmm7,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "psrlw $0x8,%xmm2\n" + "psrlw $0x8,%xmm3\n" + "packuswb %xmm3,%xmm2\n" + "movdqa %xmm2,(%edx)\n" + "lea 0x10(%edx),%edx\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "pop %edi\n" + "pop %esi\n" + "pop %ebp\n" + "ret\n" +); +#endif + +static void SplitUV_C(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix) { + // Copy a row of UV. + for (int x = 0; x < pix; ++x) { + dst_u[0] = src_uv[0]; + dst_v[0] = src_uv[1]; + src_uv += 2; + dst_u += 1; + dst_v += 1; + } +} + +static void I420CopyPlane(const uint8* src_y, int src_pitch_y, + uint8* dst_y, int dst_pitch_y, + int width, int height) { + // Copy plane + for (int y = 0; y < height; ++y) { + memcpy(dst_y, src_y, width); + src_y += src_pitch_y; + dst_y += dst_pitch_y; + } +} + +static void I420CopyPlane2(const uint8* src, int src_pitch_0, int src_pitch_1, + uint8* dst, int dst_pitch, + int width, int height) { + // Copy plane + for (int y = 0; y < height; y += 2) { + memcpy(dst, src, width); + src += src_pitch_0; + dst += dst_pitch; + memcpy(dst, src, width); + src += src_pitch_1; + dst += dst_pitch; + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of VU. +// Chroma is half width / half height. (420) +// pitch_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so pitch_m420 applies to this +// as well as the two Y planes. 
+// TODO(fbarchard): Do NV21/NV12 formats with this function.
+static void X420ToI420(uint8* dst_y, int dst_pitch_y,
+                       uint8* dst_u, int dst_pitch_u,
+                       uint8* dst_v, int dst_pitch_v,
+                       const uint8* src_y,
+                       int src_pitch_y0, int src_pitch_y1,
+                       const uint8* src_uv, int src_pitch_uv,
+                       int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_pitch_y;
+    dst_u = dst_u + (height - 1) * dst_pitch_u;
+    dst_v = dst_v + (height - 1) * dst_pitch_v;
+    dst_pitch_y = -dst_pitch_y;
+    dst_pitch_u = -dst_pitch_u;
+    dst_pitch_v = -dst_pitch_v;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+#if defined(HAS_SPLITUV_NEON)
+  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasNEON) &&
+      (halfwidth % 16 == 0) &&
+      IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
+      IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
+      IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
+    SplitUV = SplitUV_NEON;
+  } else
+#elif defined(HAS_SPLITUV_SSE2)
+  if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) &&
+      (halfwidth % 16 == 0) &&
+      IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
+      IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
+      IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
+    SplitUV = SplitUV_SSE2;
+  } else
+#endif
+  {
+    SplitUV = SplitUV_C;
+  }
+
+  I420CopyPlane2(src_y, src_pitch_y0, src_pitch_y1, dst_y, dst_pitch_y,
+                 width, height);
+
+  int halfheight = (height + 1) >> 1;
+  for (int y = 0; y < halfheight; ++y) {
+    // Split a row of interleaved UV into separate U and V rows.
+    SplitUV(src_uv, dst_u, dst_v, halfwidth);
+    dst_u += dst_pitch_u;
+    dst_v += dst_pitch_v;
+    src_uv += src_pitch_uv;
+  }
+}
+
+// TODO(fbarchard): For biplanar formats (e.g. NV21), the Y plane is the same
+// as I420, and only the chroma plane varies. Copy the Y plane by reference,
+// and just convert the UV. This method can be used for NV21, NV12, I420,
+// I422, M422. 8 of the 12 bits are Y, so only a third of the data would
+// need to be converted, which is approximately how much faster it would be.
+
+// Helper function to copy yuv data without scaling. Used
+// by our jpeg conversion callbacks to incrementally fill a yuv image.
+void PlanarFunctions::I420Copy(const uint8* src_y, int src_pitch_y,
+                               const uint8* src_u, int src_pitch_u,
+                               const uint8* src_v, int src_pitch_v,
+                               uint8* dst_y, int dst_pitch_y,
+                               uint8* dst_u, int dst_pitch_u,
+                               uint8* dst_v, int dst_pitch_v,
+                               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_pitch_y;
+    src_u = src_u + (height - 1) * src_pitch_u;
+    src_v = src_v + (height - 1) * src_pitch_v;
+    src_pitch_y = -src_pitch_y;
+    src_pitch_u = -src_pitch_u;
+    src_pitch_v = -src_pitch_v;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  I420CopyPlane(src_y, src_pitch_y, dst_y, dst_pitch_y, width, height);
+  I420CopyPlane(src_u, src_pitch_u, dst_u, dst_pitch_u, halfwidth, halfheight);
+  I420CopyPlane(src_v, src_pitch_v, dst_v, dst_pitch_v, halfwidth, halfheight);
+}
+
+// Converts I422 (4:2:2) to I420 (4:2:0): the Y plane is copied as-is and
+// each pair of vertically adjacent chroma rows is averaged down to one.
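+// Specifically, dst_u[x] = (u0[x] + u1[x] + 1) >> 1 for each pair of source
+// rows u0 and u1 (the last row is reused when the source height is odd);
+// the V plane is handled the same way.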
+void PlanarFunctions::I422ToI420(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_pitch_y; + src_u = src_u + (height - 1) * src_pitch_u; + src_v = src_v + (height - 1) * src_pitch_v; + src_pitch_y = -src_pitch_y; + src_pitch_u = -src_pitch_u; + src_pitch_v = -src_pitch_v; + } + + // Copy Y plane + I420CopyPlane(src_y, src_pitch_y, dst_y, dst_pitch_y, width, height); + + // SubSample UV planes. + int x, y; + int halfwidth = (width + 1) >> 1; + for (y = 0; y < height; y += 2) { + const uint8* u0 = src_u; + const uint8* u1 = src_u + src_pitch_u; + if ((y + 1) >= height) { + u1 = u0; + } + for (x = 0; x < halfwidth; ++x) { + dst_u[x] = (u0[x] + u1[x] + 1) >> 1; + } + src_u += src_pitch_u * 2; + dst_u += dst_pitch_u; + } + for (y = 0; y < height; y += 2) { + const uint8* v0 = src_v; + const uint8* v1 = src_v + src_pitch_v; + if ((y + 1) >= height) { + v1 = v0; + } + for (x = 0; x < halfwidth; ++x) { + dst_v[x] = (v0[x] + v1[x] + 1) >> 1; + } + src_v += src_pitch_v * 2; + dst_v += dst_pitch_v; + } +} + +// Convert M420 to I420. +void PlanarFunctions::M420ToI420(uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + const uint8* m420, int pitch_m420, + int width, int height) { + X420ToI420(dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v, + m420, pitch_m420, pitch_m420 * 2, + m420 + pitch_m420 * 2, pitch_m420 * 3, + width, height); +} + +// Convert NV12 to I420. +void PlanarFunctions::NV12ToI420(uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + const uint8* src_y, + const uint8* src_uv, + int src_pitch, + int width, int height) { + X420ToI420(dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v, + src_y, src_pitch, src_pitch, + src_uv, src_pitch, + width, height); +} + +} // namespace libyuv diff --git a/source/row.h b/source/row.h new file mode 100644 index 000000000..4d50bf60d --- /dev/null +++ b/source/row.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef LIBYUV_SOURCE_ROW_H_ +#define LIBYUV_SOURCE_ROW_H_ + +#include "basic_types.h" + +extern "C" { +// Can only do 1x. +// This is the second fastest of the scalers. +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#else +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#endif +#ifdef OSX +extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); +#else +extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]); +#endif + +// Method to force C version. 
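+// Uncommenting the two defines below (or defining them to 0 on the compiler
+// command line, e.g. -DUSE_MMX=0 -DUSE_SSE2=0) forces the portable C paths.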
+//#define USE_MMX 0 +//#define USE_SSE2 0 + +#if !defined(USE_MMX) +// Windows, Mac and Linux use MMX +#if defined(__i386__) || defined(_MSC_VER) +#define USE_MMX 1 +#else +#define USE_MMX 0 +#endif +#endif + +#if !defined(USE_SSE2) +#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2 +#define USE_SSE2 1 +#else +#define USE_SSE2 0 +#endif +#endif + +// x64 uses MMX2 (SSE) so emms is not required. +// Warning C4799: function has no EMMS instruction. +// EMMS() is slow and should be called by the calling function once per image. +#if USE_MMX && !defined(ARCH_CPU_X86_64) +#if defined(_MSC_VER) +#define EMMS() __asm emms +#pragma warning(disable: 4799) +#else +#define EMMS() asm("emms") +#endif +#else +#define EMMS() +#endif + +} // extern "C" + +#endif // LIBYUV_SOURCE_ROW_H_ diff --git a/source/row_posix.cc b/source/row_posix.cc new file mode 100644 index 000000000..61d89a167 --- /dev/null +++ b/source/row_posix.cc @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +extern "C" { + +#if defined(__x86_64__) + +// 64 bit linux gcc version + +void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm( +"1:" + "movzb (%1),%%r10\n" + "lea 1(%1),%1\n" + "movzb (%2),%%r11\n" + "lea 1(%2),%2\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%0),%%r10\n" + "movq 4096(%5,%%r11,8),%%xmm1\n" + "movzb 0x1(%0),%%r11\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%5,%%r10,8),%%xmm2\n" + "lea 2(%0),%0\n" + "movq (%5,%%r11,8),%%xmm3\n" + "paddsw %%xmm0,%%xmm2\n" + "paddsw %%xmm0,%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%3)\n" + "lea 8(%3),%3\n" + "sub $0x2,%4\n" + "ja 1b\n" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (_kCoefficientsRgbY) // %5 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +); +} + +#elif defined(__i386__) +// 32 bit gcc version + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYUVToRGB32Row\n" +"_FastConvertYUVToRGB32Row:\n" +#else + ".global FastConvertYUVToRGB32Row\n" +"FastConvertYUVToRGB32Row:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + +"1:" + "movzbl (%edi),%eax\n" + "lea 1(%edi),%edi\n" + "movzbl (%esi),%ebx\n" + "lea 1(%esi),%esi\n" + "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq _kCoefficientsRgbY(,%eax,8),%mm1\n" + "lea 2(%edx),%edx\n" + "movq _kCoefficientsRgbY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "lea 8(%ebp),%ebp\n" + "sub $0x2,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +#else +// C reference code that mimic the YUV 
assembly. +#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) +#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ + (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) + +static inline void YuvPixel(uint8 y, + uint8 u, + uint8 v, + uint8* rgb_buf) { + + int b = _kCoefficientsRgbY[256+u][0]; + int g = _kCoefficientsRgbY[256+u][1]; + int r = _kCoefficientsRgbY[256+u][2]; + int a = _kCoefficientsRgbY[256+u][3]; + + b = paddsw(b, _kCoefficientsRgbY[512+v][0]); + g = paddsw(g, _kCoefficientsRgbY[512+v][1]); + r = paddsw(r, _kCoefficientsRgbY[512+v][2]); + a = paddsw(a, _kCoefficientsRgbY[512+v][3]); + + b = paddsw(b, _kCoefficientsRgbY[y][0]); + g = paddsw(g, _kCoefficientsRgbY[y][1]); + r = paddsw(r, _kCoefficientsRgbY[y][2]); + a = paddsw(a, _kCoefficientsRgbY[y][3]); + + b >>= 6; + g >>= 6; + r >>= 6; + a >>= 6; + + *reinterpret_cast(rgb_buf) = (packuswb(b)) | + (packuswb(g) << 8) | + (packuswb(r) << 16) | + (packuswb(a) << 24); +} + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; x += 2) { + uint8 u = u_buf[x >> 1]; + uint8 v = v_buf[x >> 1]; + uint8 y0 = y_buf[x]; + YuvPixel(y0, u, v, rgb_buf); + if ((x + 1) < width) { + uint8 y1 = y_buf[x + 1]; + YuvPixel(y1, u, v, rgb_buf + 4); + } + rgb_buf += 8; // Advance 2 pixels. + } +} +#endif + +} // extern "C" diff --git a/source/row_table.cc b/source/row_table.cc new file mode 100644 index 000000000..6a97da831 --- /dev/null +++ b/source/row_table.cc @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +extern "C" { + +#define RGBY(i) { \ + static_cast(1.164 * 64 * (i - 16) + 0.5), \ + static_cast(1.164 * 64 * (i - 16) + 0.5), \ + static_cast(1.164 * 64 * (i - 16) + 0.5), \ + 0 \ +} + +#define RGBU(i) { \ + static_cast(2.018 * 64 * (i - 128) + 0.5), \ + static_cast(-0.391 * 64 * (i - 128) + 0.5), \ + 0, \ + static_cast(256 * 64 - 1) \ +} + +#define RGBV(i) { \ + 0, \ + static_cast(-0.813 * 64 * (i - 128) + 0.5), \ + static_cast(1.596 * 64 * (i - 128) + 0.5), \ + 0 \ +} + +#ifdef OSX +SIMD_ALIGNED(const int16 kCoefficientsRgbY[256 * 3][4]) = { +#else +SIMD_ALIGNED(const int16 _kCoefficientsRgbY[256 * 3][4]) = { +#endif + + // Luminance table. 
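+  // Each RGBY(i) entry is round(1.164 * 64 * (i - 16)) replicated into the
+  // B, G and R lanes, with A = 0. The factor of 64 gives the table 6
+  // fractional bits, matching the final >> 6 (psraw $0x6) in the row code.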
+ RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03), + RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07), + RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B), + RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F), + RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13), + RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17), + RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B), + RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F), + RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23), + RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27), + RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B), + RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F), + RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33), + RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37), + RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B), + RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F), + RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43), + RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47), + RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B), + RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F), + RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53), + RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57), + RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B), + RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F), + RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63), + RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67), + RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B), + RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F), + RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73), + RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77), + RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B), + RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F), + RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83), + RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87), + RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B), + RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F), + RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93), + RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97), + RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B), + RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F), + RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3), + RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7), + RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB), + RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF), + RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3), + RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7), + RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB), + RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF), + RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3), + RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7), + RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB), + RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF), + RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3), + RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7), + RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB), + RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF), + RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3), + RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7), + RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB), + RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF), + RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3), + RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7), + RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB), + RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF), + + // Chroma U table. 
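+  // Each RGBU(i) entry adds 2.018 * 64 * (i - 128) to B and
+  // -0.391 * 64 * (i - 128) to G. Its alpha lane of 256 * 64 - 1 comes out
+  // to 255 after the final >> 6, so alpha is always fully opaque.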
+ RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03), + RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07), + RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B), + RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F), + RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13), + RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17), + RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B), + RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F), + RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23), + RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27), + RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B), + RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F), + RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33), + RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37), + RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B), + RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F), + RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43), + RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47), + RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B), + RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F), + RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53), + RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57), + RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B), + RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F), + RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63), + RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67), + RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B), + RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F), + RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73), + RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77), + RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B), + RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F), + RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83), + RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87), + RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B), + RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F), + RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93), + RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97), + RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B), + RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F), + RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3), + RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7), + RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB), + RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF), + RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3), + RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7), + RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB), + RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF), + RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3), + RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7), + RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB), + RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF), + RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3), + RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7), + RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB), + RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF), + RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3), + RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7), + RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB), + RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF), + RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3), + RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7), + RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB), + RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF), + + // Chroma V table. 
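+  // Each RGBV(i) entry adds -0.813 * 64 * (i - 128) to G and
+  // 1.596 * 64 * (i - 128) to R; its B and A lanes are zero.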
+ RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03), + RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07), + RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B), + RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F), + RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13), + RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17), + RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B), + RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F), + RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23), + RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27), + RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B), + RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F), + RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33), + RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37), + RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B), + RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F), + RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43), + RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47), + RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B), + RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F), + RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53), + RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57), + RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B), + RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F), + RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63), + RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67), + RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B), + RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F), + RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73), + RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77), + RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B), + RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F), + RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83), + RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87), + RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B), + RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F), + RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93), + RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97), + RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B), + RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F), + RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3), + RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7), + RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB), + RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF), + RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3), + RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7), + RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB), + RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF), + RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3), + RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7), + RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB), + RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF), + RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3), + RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7), + RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB), + RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF), + RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3), + RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7), + RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB), + RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF), + RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3), + RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7), + RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB), + RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), +}; + +#undef RGBY +#undef RGBU +#undef RGBV + +} // extern "C" diff --git a/source/scale.cc b/source/scale.cc new file mode 100644 index 000000000..da4a9c6cc --- /dev/null +++ b/source/scale.cc @@ -0,0 +1,2848 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "scale.h" + +#include +#include "common.h" + +#include "cpu_id.h" + +// Note: A Neon reference manual +// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html +// Note: Some SSE2 reference manuals +// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf + +// TODO(fbarchard): Remove once performance is known +//#define TEST_RSTSC + +#if defined(TEST_RSTSC) +#include +#include +#ifdef _MSC_VER +#include +#endif + +#if defined(__GNUC__) && defined(__i386__) +static inline uint64 __rdtsc(void) { + uint32_t a, d; + __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((uint64)d << 32) + a; +} +#endif +#endif + +namespace libyuv { + +// Set the following flag to true to revert to only +// using the reference implementation ScalePlaneBox(), and +// NOT the optimized versions. Useful for debugging and +// when comparing the quality of the resulting YUV planes +// as produced by the optimized and non-optimized versions. +bool YuvScaler::use_reference_impl_ = false; + + +/** + * NEON downscalers with interpolation. + * + * Provided by Fritz Koenig + * + */ + +#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) +#define HAS_SCALEROWDOWN2_NEON +void ScaleRowDown2_NEON(const uint8* iptr, int32 /* istride */, + uint8* dst, int32 owidth) { + __asm__ volatile + ( + "1:\n" + "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 + "vst1.u8 {q0}, [%1]! \n" // store even pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "bhi 1b \n" + : // Output registers + : "r"(iptr), "r"(dst), "r"(owidth) // Input registers + : "r4", "q0", "q1" // Clobber List + ); +} + +void ScaleRowDown2Int_NEON(const uint8* iptr, int32 istride, + uint8* dst, int32 owidth) { + __asm__ volatile + ( + "mov r4, #2 \n" // rounding constant + "add %1, %0 \n" // l2 + "vdup.16 q4, r4 \n" + "1:\n" + "vld1.u8 {q0,q1}, [%0]! \n" // load l1 and post increment + "vld1.u8 {q2,q3}, [%1]! \n" // load l2 and post increment + "vpaddl.u8 q0, q0 \n" // l1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // l2 add adjacent and add l1 to l2 + "vpadal.u8 q1, q3 \n" + "vadd.u16 q0, q4 \n" // rounding + "vadd.u16 q1, q4 \n" + "vshrn.u16 d0, q0, #2 \n" // downshift and pack + "vshrn.u16 d1, q1, #2 \n" + "vst1.u8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" + : // Output registers + : "r"(iptr), "r"(istride), "r"(dst), "r"(owidth) // Input registers + : "r4", "q0", "q1", "q2", "q3", "q4" // Clobber List + ); +} + +/** + * SSE2 downscalers with interpolation. 
+ * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ + +// Constants for SSE2 code +#elif (defined(WIN32) || defined(__i386__)) && !defined(COVERAGE_ENABLED) && \ + !defined(__PIC__) && !TARGET_IPHONE_SIMULATOR +#if defined(_MSC_VER) +#define TALIGN16(t, var) __declspec(align(16)) t _ ## var +#elif defined(OSX) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#else +#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) +#endif + +// Offsets for source bytes 0 to 9 +extern "C" TALIGN16(const uint8, shuf0[16]) = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +extern "C" TALIGN16(const uint8, shuf1[16]) = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +extern "C" TALIGN16(const uint8, shuf2[16]) = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +extern "C" TALIGN16(const uint8, shuf01[16]) = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +extern "C" TALIGN16(const uint8, shuf11[16]) = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +extern "C" TALIGN16(const uint8, shuf21[16]) = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +extern "C" TALIGN16(const uint8, madd01[16]) = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +extern "C" TALIGN16(const uint8, madd11[16]) = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +extern "C" TALIGN16(const uint8, madd21[16]) = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +extern "C" TALIGN16(const int16, round34[8]) = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +extern "C" TALIGN16(const uint8, shuf38a[16]) = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +extern "C" TALIGN16(const uint8, shuf38b[16]) = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +extern "C" TALIGN16(const uint8, shufac0[16]) = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +extern "C" TALIGN16(const uint8, shufac3[16]) = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +extern "C" TALIGN16(const uint16, scaleac3[8]) = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +extern "C" TALIGN16(const uint8, shufab0[16]) = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +extern "C" TALIGN16(const uint8, shufab1[16]) = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +extern "C" TALIGN16(const uint8, shufab2[16]) = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +extern "C" TALIGN16(const uint16, scaleab2[8]) = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +#endif + +#if defined(WIN32) && 
!defined(COVERAGE_ENABLED) + +#define HAS_SCALEROWDOWN2_SSE2 +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: iptr 16 byte aligned, optr 16 byte aligned. +__declspec(naked) +static void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth) { + __asm { + mov eax, [esp + 4] // iptr + // istride ignored + mov edx, [esp + 12] // optr + mov ecx, [esp + 16] // owidth + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm7 + pand xmm1, xmm7 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + ret + } +} +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: iptr 16 byte aligned, optr 16 byte aligned. +__declspec(naked) +static void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth) { + __asm { + push esi + mov eax, [esp + 4 + 4] // iptr + mov esi, [esp + 4 + 8] // istride + mov edx, [esp + 4 + 12] // optr + mov ecx, [esp + 4 + 16] // owidth + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + pop esi + ret + } +} + +#define HAS_SCALEROWDOWN4_SSE2 +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + // istride ignored + mov edi, [esp + 32 + 12] // orow + mov ecx, [esp + 32 + 16] // owidth + pcmpeqb xmm7, xmm7 // generate mask 0x000000ff + psrld xmm7, 24 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm7 + pand xmm1, xmm7 + packuswb xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. 
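+// (Each output pixel is produced by cascaded pavgb/pavgw averages over a
+// 4x4 block of input pixels; every step rounds, so the result can differ
+// slightly from a true 16-pixel mean.)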
+__declspec(naked) +static void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + mov ebx, [esp + 32 + 8] // istride + mov edi, [esp + 32 + 12] // orow + mov ecx, [esp + 32 + 16] // owidth + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + lea edx, [ebx + ebx * 2] // istride * 3 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN8_SSE2 +// Point samples 32 pixels to 4 pixels. +// Alignment requirement: iptr 16 byte aligned, optr 4 byte aligned. +__declspec(naked) +static void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + // istride ignored + mov edi, [esp + 32 + 12] // orow + mov ecx, [esp + 32 + 16] // owidth + pcmpeqb xmm7, xmm7 // generate mask isolating 1 in 8 bytes + psrlq xmm7, 56 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm7 + pand xmm1, xmm7 + packuswb xmm0, xmm1 // 32->16 + packuswb xmm0, xmm0 // 16->8 + packuswb xmm0, xmm0 // 8->4 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +// Blends 32x8 rectangle to 4x1. +// Alignment requirement: iptr 16 byte aligned, optr 4 byte aligned. 
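+// (Collapses each 8x8 block: three levels of pavgb reduce the eight rows
+// to one, then psadbw sums each run of 8 bytes and psrlw 3 divides by 8.)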
+__declspec(naked) +static void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + mov ebx, [esp + 32 + 8] // istride + mov edi, [esp + 32 + 12] // orow + mov ecx, [esp + 32 + 16] // owidth + lea edx, [ebx + ebx * 2] // istride * 3 + pxor xmm7, xmm7 + + wloop: + movdqa xmm0, [esi] // average 8 rows to 1 + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea ebp, [esi + ebx * 4] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, [ebp] + movdqa xmm3, [ebp + 16] + movdqa xmm4, [ebp + ebx] + movdqa xmm5, [ebp + ebx + 16] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + movdqa xmm4, [ebp + ebx * 2] + movdqa xmm5, [ebp + ebx * 2 + 16] + movdqa xmm6, [ebp + edx] + pavgb xmm4, xmm6 + movdqa xmm6, [ebp + edx + 16] + pavgb xmm5, xmm6 + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + psadbw xmm0, xmm7 // average 32 pixels to 4 + psadbw xmm1, xmm7 + pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 + pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx + por xmm0, xmm1 // -> 3201 + psrlw xmm0, 3 + packuswb xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN34_SSSE3 +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + // istride ignored + mov edi, [esp + 32 + 12] // orow + mov ecx, [esp + 32 + 16] // owidth + movdqa xmm3, _shuf0 + movdqa xmm4, _shuf1 + movdqa xmm5, _shuf2 + + wloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + 16] + lea esi, [esi + 32] + movdqa xmm1, xmm2 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edi], xmm0 + movq qword ptr [edi + 8], xmm1 + movq qword ptr [edi + 16], xmm2 + lea edi, [edi + 24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 irow 0 +// xmm1 irow 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 round34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. 
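+// (For each group of four input pixels a,b,c,d, the 3,1 / 2,2 / 1,3
+// pmaddubsw coefficients with the round34 bias of 2 and psrlw 2 produce
+// the three outputs (3a+b+2)/4, (2b+2c+2)/4 and (c+3d+2)/4.)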
+__declspec(naked) +static void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + mov ebx, [esp + 32 + 8] // istride + mov edi, [esp + 32 + 12] // orow + mov ecx, [esp + 32 + 16] // owidth + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + mov ebx, [esp + 32 + 8] // istride + mov edi, [esp + 32 + 12] // orow + mov ecx, [esp + 32 + 16] // owidth + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN38_SSSE3 +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) +static void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + mov edx, [esp + 32 + 8] // istride + mov edi, [esp + 32 + 12] // optr + mov ecx, [esp + 32 + 16] // owidth + movdqa xmm5, _shuf38a + movdqa xmm6, _shuf38b + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 + lea esi, [esi + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm6 + paddusb xmm0, xmm1 + + movq qword ptr [edi], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edi + 8], xmm1 + lea edi, [edi + 12] + sub ecx, 12 + ja xloop + + popad + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth) { + __asm { + 
pushad + mov esi, [esp + 32 + 4] // iptr + mov edx, [esp + 32 + 8] // istride + mov edi, [esp + 32 + 12] // optr + mov ecx, [esp + 32 + 16] // owidth + movdqa xmm4, _shufac0 + movdqa xmm5, _shufac3 + movdqa xmm6, _scaleac3 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 + movdqa xmm2, [esi + edx] + movhlps xmm1, xmm0 + movhlps xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + movdqa xmm2, [esi + edx * 2] + lea esi, [esi + 16] + movhlps xmm3, xmm2 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + + movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + pshufb xmm2, xmm4 + + movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + pshufb xmm3, xmm5 + paddusw xmm2, xmm3 + + pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 + packuswb xmm2, xmm2 + + movd [edi], xmm2 // write 6 pixels + pextrw eax, xmm2, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + mov edx, [esp + 32 + 8] // istride + mov edi, [esp + 32 + 12] // optr + mov ecx, [esp + 32 + 16] // owidth + movdqa xmm4, _shufab0 + movdqa xmm5, _shufab1 + movdqa xmm6, _shufab2 + movdqa xmm7, _scaleab2 + + xloop: + movdqa xmm2, [esi] // average 2 rows into xmm2 + pavgb xmm2, [esi + edx] + lea esi, [esi + 16] + + movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 + pshufb xmm0, xmm4 + movdqa xmm1, xmm2 + pshufb xmm1, xmm5 + paddusw xmm0, xmm1 + pshufb xmm2, xmm6 + paddusw xmm0, xmm2 + + pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 + packuswb xmm0, xmm0 + + movd [edi], xmm0 // write 6 pixels + pextrw eax, xmm0, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +#define HAS_SCALEADDROWS_SSE2 + +// Reads 8xN bytes and produces 16 shorts at a time. +__declspec(naked) +static void ScaleAddRows_SSE2(const uint8* iptr, int32 istride, + uint16* orow, int32 iwidth, int32 iheight) { + __asm { + pushad + mov esi, [esp + 32 + 4] // iptr + mov edx, [esp + 32 + 8] // istride + mov edi, [esp + 32 + 12] // orow + mov ecx, [esp + 32 + 16] // owidth + mov ebx, [esp + 32 + 20] // height + pxor xmm7, xmm7 + dec ebx + + xloop: + // first row + movdqa xmm2, [esi] + lea eax, [esi + edx] + movhlps xmm3, xmm2 + mov ebp, ebx + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + + // sum remaining rows + yloop: + movdqa xmm0, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movhlps xmm1, xmm0 + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + paddusw xmm2, xmm0 // sum 16 words + paddusw xmm3, xmm1 + sub ebp, 1 + ja yloop + + movdqa [edi], xmm2 + movdqa [edi + 16], xmm3 + lea edi, [edi + 32] + lea esi, [esi + 16] + + sub ecx, 16 + ja xloop + + popad + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. 
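+// (Computes dst[x] = (src0[x] * (256 - f) + src1[x] * f) >> 8, where f is
+// source_y_fraction; f == 0 takes a plain copy fast path and f == 128 a
+// pavgb fast path.)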
+#define HAS_SCALEFILTERROWS_SSE2 +__declspec(naked) +static void ScaleFilterRows_SSE2(uint8* optr, const uint8* iptr0, int32 istride, + int owidth, int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // optr + mov esi, [esp + 8 + 8] // iptr0 + mov edx, [esp + 8 + 12] // istride + mov ecx, [esp + 8 + 16] // owidth + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + movd xmm6, eax // xmm6 = y fraction + punpcklwd xmm6, xmm6 + pshufd xmm6, xmm6, 0 + neg eax // xmm5 = 256 - y fraction + add eax, 256 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm2, xmm7 + punpckhbw xmm1, xmm7 + punpckhbw xmm3, xmm7 + pmullw xmm0, xmm5 // scale row 0 + pmullw xmm1, xmm5 + pmullw xmm2, xmm6 // scale row 1 + pmullw xmm3, xmm6 + paddusw xmm0, xmm2 // sum rows + paddusw xmm1, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. +#define HAS_SCALEFILTERROWS_SSSE3 +__declspec(naked) +static void ScaleFilterRows_SSSE3(uint8* optr, const uint8* iptr0, int32 istride, + int owidth, int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // optr + mov esi, [esp + 8 + 8] // iptr0 + mov edx, [esp + 8 + 12] // istride + mov ecx, [esp + 8 + 16] // owidth + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + shr eax, 1 + mov ah,al + neg al + add al, 128 + movd xmm7, eax + punpcklwd xmm7, xmm7 + pshufd xmm7, xmm7, 0 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. 
+__declspec(naked) +static void ScaleFilterCols34_SSSE3(uint8* optr, const uint8* iptr, + int owidth) { + __asm { + mov edx, [esp + 4] // optr + mov eax, [esp + 8] // iptr + mov ecx, [esp + 12] // owidth + movdqa xmm1, _round34 + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _madd21 + + wloop: + movdqa xmm0, [eax] // pixels 0..7 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax+8] // pixels 8..15 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+8], xmm0 + movdqa xmm0, [eax+16] // pixels 16..23 + lea eax, [eax+32] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xmm7 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + ja wloop + ret + } +} + +#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ + !TARGET_IPHONE_SIMULATOR + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt +#define HAS_SCALEROWDOWN2_SSE2 +extern "C" void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown2_SSE2\n" +"_ScaleRowDown2_SSE2:\n" +#else + ".global ScaleRowDown2_SSE2\n" +"ScaleRowDown2_SSE2:\n" +#endif + "mov 0x4(%esp),%eax\n" + "mov 0xc(%esp),%edx\n" + "mov 0x10(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrlw $0x8,%xmm7\n" + +"1:" + "movdqa (%eax),%xmm0\n" + "movdqa 0x10(%eax),%xmm1\n" + "lea 0x20(%eax),%eax\n" + "pand %xmm7,%xmm0\n" + "pand %xmm7,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,(%edx)\n" + "lea 0x10(%edx),%edx\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "ret\n" +); + +extern "C" void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown2Int_SSE2\n" +"_ScaleRowDown2Int_SSE2:\n" +#else + ".global ScaleRowDown2Int_SSE2\n" +"ScaleRowDown2Int_SSE2:\n" +#endif + "push %esi\n" + "mov 0x8(%esp),%eax\n" + "mov 0xc(%esp),%esi\n" + "mov 0x10(%esp),%edx\n" + "mov 0x14(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrlw $0x8,%xmm7\n" + +"1:" + "movdqa (%eax),%xmm0\n" + "movdqa 0x10(%eax),%xmm1\n" + "movdqa (%eax,%esi,1),%xmm2\n" + "movdqa 0x10(%eax,%esi,1),%xmm3\n" + "lea 0x20(%eax),%eax\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "movdqa %xmm0,%xmm2\n" + "psrlw $0x8,%xmm0\n" + "movdqa %xmm1,%xmm3\n" + "psrlw $0x8,%xmm1\n" + "pand %xmm7,%xmm2\n" + "pand %xmm7,%xmm3\n" + "pavgw %xmm2,%xmm0\n" + "pavgw %xmm3,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,(%edx)\n" + "lea 0x10(%edx),%edx\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "pop %esi\n" + "ret\n" +); + +#define HAS_SCALEROWDOWN4_SSE2 +extern "C" void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown4_SSE2\n" +"_ScaleRowDown4_SSE2:\n" +#else + ".global ScaleRowDown4_SSE2\n" +"ScaleRowDown4_SSE2:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrld $0x18,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm1\n" + "lea 0x20(%esi),%esi\n" + "pand %xmm7,%xmm0\n" + "pand %xmm7,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + 
"movq %xmm0,(%edi)\n" + "lea 0x8(%edi),%edi\n" + "sub $0x8,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown4Int_SSE2\n" +"_ScaleRowDown4Int_SSE2:\n" +#else + ".global ScaleRowDown4Int_SSE2\n" +"ScaleRowDown4Int_SSE2:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%ebx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrlw $0x8,%xmm7\n" + "lea (%ebx,%ebx,2),%edx\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm1\n" + "movdqa (%esi,%ebx,1),%xmm2\n" + "movdqa 0x10(%esi,%ebx,1),%xmm3\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "movdqa (%esi,%ebx,2),%xmm2\n" + "movdqa 0x10(%esi,%ebx,2),%xmm3\n" + "movdqa (%esi,%edx,1),%xmm4\n" + "movdqa 0x10(%esi,%edx,1),%xmm5\n" + "lea 0x20(%esi),%esi\n" + "pavgb %xmm4,%xmm2\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm5,%xmm3\n" + "pavgb %xmm3,%xmm1\n" + "movdqa %xmm0,%xmm2\n" + "psrlw $0x8,%xmm0\n" + "movdqa %xmm1,%xmm3\n" + "psrlw $0x8,%xmm1\n" + "pand %xmm7,%xmm2\n" + "pand %xmm7,%xmm3\n" + "pavgw %xmm2,%xmm0\n" + "pavgw %xmm3,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,%xmm2\n" + "psrlw $0x8,%xmm0\n" + "pand %xmm7,%xmm2\n" + "pavgw %xmm2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,(%edi)\n" + "lea 0x8(%edi),%edi\n" + "sub $0x8,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +#define HAS_SCALEROWDOWN8_SSE2 +extern "C" void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown8_SSE2\n" +"_ScaleRowDown8_SSE2:\n" +#else + ".global ScaleRowDown8_SSE2\n" +"ScaleRowDown8_SSE2:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrlq $0x38,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm1\n" + "lea 0x20(%esi),%esi\n" + "pand %xmm7,%xmm0\n" + "pand %xmm7,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movd %xmm0,(%edi)\n" + "lea 0x4(%edi),%edi\n" + "sub $0x4,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown8Int_SSE2\n" +"_ScaleRowDown8Int_SSE2:\n" +#else + ".global ScaleRowDown8Int_SSE2\n" +"ScaleRowDown8Int_SSE2:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%ebx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "lea (%ebx,%ebx,2),%edx\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm1\n" + "movdqa (%esi,%ebx,1),%xmm2\n" + "movdqa 0x10(%esi,%ebx,1),%xmm3\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "movdqa (%esi,%ebx,2),%xmm2\n" + "movdqa 0x10(%esi,%ebx,2),%xmm3\n" + "movdqa (%esi,%edx,1),%xmm4\n" + "movdqa 0x10(%esi,%edx,1),%xmm5\n" + "lea (%esi,%ebx,4),%ebp\n" + "lea 0x20(%esi),%esi\n" + "pavgb %xmm4,%xmm2\n" + "pavgb %xmm5,%xmm3\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "movdqa 0x0(%ebp),%xmm2\n" + "movdqa 0x10(%ebp),%xmm3\n" + "movdqa 0x0(%ebp,%ebx,1),%xmm4\n" + "movdqa 0x10(%ebp,%ebx,1),%xmm5\n" + "pavgb %xmm4,%xmm2\n" + "pavgb %xmm5,%xmm3\n" + "movdqa 0x0(%ebp,%ebx,2),%xmm4\n" + "movdqa 0x10(%ebp,%ebx,2),%xmm5\n" + "movdqa 0x0(%ebp,%edx,1),%xmm6\n" + "pavgb %xmm6,%xmm4\n" + "movdqa 0x10(%ebp,%edx,1),%xmm6\n" + "pavgb %xmm6,%xmm5\n" + "pavgb 
%xmm4,%xmm2\n" + "pavgb %xmm5,%xmm3\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "psadbw %xmm7,%xmm0\n" + "psadbw %xmm7,%xmm1\n" + "pshufd $0xd8,%xmm0,%xmm0\n" + "pshufd $0x8d,%xmm1,%xmm1\n" + "por %xmm1,%xmm0\n" + "psrlw $0x3,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movd %xmm0,(%edi)\n" + "lea 0x4(%edi),%edi\n" + "sub $0x4,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +// fpic is used for magiccam plugin +#if !defined(__PIC__) +#define HAS_SCALEROWDOWN34_SSSE3 +extern "C" void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown34_SSSE3\n" +"_ScaleRowDown34_SSSE3:\n" +#else + ".global ScaleRowDown34_SSSE3\n" +"ScaleRowDown34_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shuf0,%xmm3\n" + "movdqa _shuf1,%xmm4\n" + "movdqa _shuf2,%xmm5\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm2\n" + "lea 0x20(%esi),%esi\n" + "movdqa %xmm2,%xmm1\n" + "palignr $0x8,%xmm0,%xmm1\n" + "pshufb %xmm3,%xmm0\n" + "pshufb %xmm4,%xmm1\n" + "pshufb %xmm5,%xmm2\n" + "movq %xmm0,(%edi)\n" + "movq %xmm1,0x8(%edi)\n" + "movq %xmm2,0x10(%edi)\n" + "lea 0x18(%edi),%edi\n" + "sub $0x18,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown34_1_Int_SSSE3\n" +"_ScaleRowDown34_1_Int_SSSE3:\n" +#else + ".global ScaleRowDown34_1_Int_SSSE3\n" +"ScaleRowDown34_1_Int_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%ebp\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shuf01,%xmm2\n" + "movdqa _shuf11,%xmm3\n" + "movdqa _shuf21,%xmm4\n" + "movdqa _madd01,%xmm5\n" + "movdqa _madd11,%xmm6\n" + "movdqa _round34,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%ebp),%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm2,%xmm0\n" + "pmaddubsw %xmm5,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,(%edi)\n" + "movdqu 0x8(%esi),%xmm0\n" + "movdqu 0x8(%esi,%ebp),%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm3,%xmm0\n" + "pmaddubsw %xmm6,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,0x8(%edi)\n" + "movdqa 0x10(%esi),%xmm0\n" + "movdqa 0x10(%esi,%ebp),%xmm1\n" + "lea 0x20(%esi),%esi\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm4,%xmm0\n" + "movdqa _madd21,%xmm1\n" + "pmaddubsw %xmm1,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,0x10(%edi)\n" + "lea 0x18(%edi),%edi\n" + "sub $0x18,%ecx\n" + "ja 1b\n" + + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown34_0_Int_SSSE3\n" +"_ScaleRowDown34_0_Int_SSSE3:\n" +#else + ".global ScaleRowDown34_0_Int_SSSE3\n" +"ScaleRowDown34_0_Int_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%ebp\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shuf01,%xmm2\n" + "movdqa _shuf11,%xmm3\n" + "movdqa _shuf21,%xmm4\n" + "movdqa _madd01,%xmm5\n" + "movdqa _madd11,%xmm6\n" + "movdqa _round34,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%ebp,1),%xmm1\n" + "pavgb %xmm0,%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm2,%xmm0\n" + "pmaddubsw %xmm5,%xmm0\n" + 
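+  // _round34 (in %xmm7) plus the shift by 2 completes the rounded weighted average.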
"paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,(%edi)\n" + "movdqu 0x8(%esi),%xmm0\n" + "movdqu 0x8(%esi,%ebp,1),%xmm1\n" + "pavgb %xmm0,%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm3,%xmm0\n" + "pmaddubsw %xmm6,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,0x8(%edi)\n" + "movdqa 0x10(%esi),%xmm0\n" + "movdqa 0x10(%esi,%ebp,1),%xmm1\n" + "lea 0x20(%esi),%esi\n" + "pavgb %xmm0,%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm4,%xmm0\n" + "movdqa _madd21,%xmm1\n" + "pmaddubsw %xmm1,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,0x10(%edi)\n" + "lea 0x18(%edi),%edi\n" + "sub $0x18,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +#define HAS_SCALEROWDOWN38_SSSE3 +extern "C" void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown38_SSSE3\n" +"_ScaleRowDown38_SSSE3:\n" +#else + ".global ScaleRowDown38_SSSE3\n" +"ScaleRowDown38_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%edx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shuf38a ,%xmm5\n" + "movdqa _shuf38b ,%xmm6\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm1\n" + "lea 0x20(%esi),%esi\n" + "pshufb %xmm5,%xmm0\n" + "pshufb %xmm6,%xmm1\n" + "paddusb %xmm1,%xmm0\n" + "movq %xmm0,(%edi)\n" + "movhlps %xmm0,%xmm1\n" + "movd %xmm1,0x8(%edi)\n" + "lea 0xc(%edi),%edi\n" + "sub $0xc,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown38_3_Int_SSSE3\n" +"_ScaleRowDown38_3_Int_SSSE3:\n" +#else + ".global ScaleRowDown38_3_Int_SSSE3\n" +"ScaleRowDown38_3_Int_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%edx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shufac0,%xmm4\n" + "movdqa _shufac3,%xmm5\n" + "movdqa _scaleac3,%xmm6\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "movhlps %xmm0,%xmm1\n" + "movhlps %xmm2,%xmm3\n" + "punpcklbw %xmm7,%xmm0\n" + "punpcklbw %xmm7,%xmm1\n" + "punpcklbw %xmm7,%xmm2\n" + "punpcklbw %xmm7,%xmm3\n" + "paddusw %xmm2,%xmm0\n" + "paddusw %xmm3,%xmm1\n" + "movdqa (%esi,%edx,2),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "movhlps %xmm2,%xmm3\n" + "punpcklbw %xmm7,%xmm2\n" + "punpcklbw %xmm7,%xmm3\n" + "paddusw %xmm2,%xmm0\n" + "paddusw %xmm3,%xmm1\n" + "movdqa %xmm0,%xmm2\n" + "psrldq $0x2,%xmm0\n" + "paddusw %xmm0,%xmm2\n" + "psrldq $0x2,%xmm0\n" + "paddusw %xmm0,%xmm2\n" + "pshufb %xmm4,%xmm2\n" + "movdqa %xmm1,%xmm3\n" + "psrldq $0x2,%xmm1\n" + "paddusw %xmm1,%xmm3\n" + "psrldq $0x2,%xmm1\n" + "paddusw %xmm1,%xmm3\n" + "pshufb %xmm5,%xmm3\n" + "paddusw %xmm3,%xmm2\n" + "pmulhuw %xmm6,%xmm2\n" + "packuswb %xmm2,%xmm2\n" + "movd %xmm2,(%edi)\n" + "pextrw $0x2,%xmm2,%eax\n" + "mov %ax,0x4(%edi)\n" + "lea 0x6(%edi),%edi\n" + "sub $0x6,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown38_2_Int_SSSE3\n" +"_ScaleRowDown38_2_Int_SSSE3:\n" +#else + ".global ScaleRowDown38_2_Int_SSSE3\n" +"ScaleRowDown38_2_Int_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%edx\n" + "mov 
0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shufab0,%xmm4\n" + "movdqa _shufab1,%xmm5\n" + "movdqa _shufab2,%xmm6\n" + "movdqa _scaleab2,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm2\n" + "pavgb (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm2,%xmm0\n" + "pshufb %xmm4,%xmm0\n" + "movdqa %xmm2,%xmm1\n" + "pshufb %xmm5,%xmm1\n" + "paddusw %xmm1,%xmm0\n" + "pshufb %xmm6,%xmm2\n" + "paddusw %xmm2,%xmm0\n" + "pmulhuw %xmm7,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movd %xmm0,(%edi)\n" + "pextrw $0x2,%xmm0,%eax\n" + "mov %ax,0x4(%edi)\n" + "lea 0x6(%edi),%edi\n" + "sub $0x6,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); +#endif // __PIC__ + +#define HAS_SCALEADDROWS_SSE2 +extern "C" void ScaleAddRows_SSE2(const uint8* iptr, int32 istride, + uint16* orow, int32 iwidth, int32 iheight); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleAddRows_SSE2\n" +"_ScaleAddRows_SSE2:\n" +#else + ".global ScaleAddRows_SSE2\n" +"ScaleAddRows_SSE2:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%edx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "mov 0x34(%esp),%ebx\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm2\n" + "lea (%esi,%edx,1),%eax\n" + "movhlps %xmm2,%xmm3\n" + "lea -0x1(%ebx),%ebp\n" + "punpcklbw %xmm7,%xmm2\n" + "punpcklbw %xmm7,%xmm3\n" + +"2:" + "movdqa (%eax),%xmm0\n" + "lea (%eax,%edx,1),%eax\n" + "movhlps %xmm0,%xmm1\n" + "punpcklbw %xmm7,%xmm0\n" + "punpcklbw %xmm7,%xmm1\n" + "paddusw %xmm0,%xmm2\n" + "paddusw %xmm1,%xmm3\n" + "sub $0x1,%ebp\n" + "ja 2b\n" + + "movdqa %xmm2,(%edi)\n" + "movdqa %xmm3,0x10(%edi)\n" + "lea 0x20(%edi),%edi\n" + "lea 0x10(%esi),%esi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +extern "C" void ScaleFilterRows_SSE2(uint8* optr, + const uint8* iptr0, int32 istride, + int owidth, int source_y_fraction); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleFilterRows_SSE2\n" +"_ScaleFilterRows_SSE2:\n" +#else + ".global ScaleFilterRows_SSE2\n" +"ScaleFilterRows_SSE2:\n" +#endif + "push %esi\n" + "push %edi\n" + "mov 0xc(%esp),%edi\n" + "mov 0x10(%esp),%esi\n" + "mov 0x14(%esp),%edx\n" + "mov 0x18(%esp),%ecx\n" + "mov 0x1c(%esp),%eax\n" + "cmp $0x0,%eax\n" + "je 2f\n" + "cmp $0x80,%eax\n" + "je 3f\n" + "movd %eax,%xmm6\n" + "punpcklwd %xmm6,%xmm6\n" + "pshufd $0x0,%xmm6,%xmm6\n" + "neg %eax\n" + "add $0x100,%eax\n" + "movd %eax,%xmm5\n" + "punpcklwd %xmm5,%xmm5\n" + "pshufd $0x0,%xmm5,%xmm5\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm0,%xmm1\n" + "movdqa %xmm2,%xmm3\n" + "punpcklbw %xmm7,%xmm0\n" + "punpcklbw %xmm7,%xmm2\n" + "punpckhbw %xmm7,%xmm1\n" + "punpckhbw %xmm7,%xmm3\n" + "pmullw %xmm5,%xmm0\n" + "pmullw %xmm5,%xmm1\n" + "pmullw %xmm6,%xmm2\n" + "pmullw %xmm6,%xmm3\n" + "paddusw %xmm2,%xmm0\n" + "paddusw %xmm3,%xmm1\n" + "psrlw $0x8,%xmm0\n" + "psrlw $0x8,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" + +"2:" + "movdqa (%esi),%xmm0\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 2b\n" + + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" + +"3:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "pavgb %xmm2,%xmm0\n" + 
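+  // Half blend (source_y_fraction == 128): pavgb averaged the two rows directly.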
"movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 3b\n" + + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +extern "C" void ScaleFilterRows_SSSE3(uint8* optr, + const uint8* iptr0, int32 istride, + int owidth, int source_y_fraction); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleFilterRows_SSSE3\n" +"_ScaleFilterRows_SSSE3:\n" +#else + ".global ScaleFilterRows_SSSE3\n" +"ScaleFilterRows_SSSE3:\n" +#endif + "push %esi\n" + "push %edi\n" + "mov 0xc(%esp),%edi\n" + "mov 0x10(%esp),%esi\n" + "mov 0x14(%esp),%edx\n" + "mov 0x18(%esp),%ecx\n" + "mov 0x1c(%esp),%eax\n" + "cmp $0x0,%eax\n" + "je 2f\n" + "cmp $0x80,%eax\n" + "je 3f\n" + "shr %eax\n" + "mov %al,%ah\n" + "neg %al\n" + "add $0x80,%al\n" + "movd %eax,%xmm7\n" + "punpcklwd %xmm7,%xmm7\n" + "pshufd $0x0,%xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm0,%xmm1\n" + "punpcklbw %xmm2,%xmm0\n" + "punpckhbw %xmm2,%xmm1\n" + "pmaddubsw %xmm7,%xmm0\n" + "pmaddubsw %xmm7,%xmm1\n" + "psrlw $0x7,%xmm0\n" + "psrlw $0x7,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" + +"2:" + "movdqa (%esi),%xmm0\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 2b\n" + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" + +"3:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "pavgb %xmm2,%xmm0\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 3b\n" + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" +); +#endif + +// CPU agnostic row functions +static void ScaleRowDown2_C(const uint8* iptr, int32, + uint8* dst, int32 owidth) { + for (int x = 0; x < owidth; ++x) { + *dst++ = *iptr; + iptr += 2; + } +} + +static void ScaleRowDown2Int_C(const uint8* iptr, int32 istride, + uint8* dst, int32 owidth) { + for (int x = 0; x < owidth; ++x) { + *dst++ = (iptr[0] + iptr[1] + + iptr[istride] + iptr[istride + 1] + 2) >> 2; + iptr += 2; + } +} + +static void ScaleRowDown4_C(const uint8* iptr, int32, + uint8* dst, int32 owidth) { + for (int x = 0; x < owidth; ++x) { + *dst++ = *iptr; + iptr += 4; + } +} + +static void ScaleRowDown4Int_C(const uint8* iptr, int32 istride, + uint8* dst, int32 owidth) { + for (int x = 0; x < owidth; ++x) { + *dst++ = (iptr[0] + iptr[1] + iptr[2] + iptr[3] + + iptr[istride + 0] + iptr[istride + 1] + + iptr[istride + 2] + iptr[istride + 3] + + iptr[istride * 2 + 0] + iptr[istride * 2 + 1] + + iptr[istride * 2 + 2] + iptr[istride * 2 + 3] + + iptr[istride * 3 + 0] + iptr[istride * 3 + 1] + + iptr[istride * 3 + 2] + iptr[istride * 3 + 3] + 8) >> 4; + iptr += 4; + } +} + +// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. +// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. 
+static const int kMaxOutputWidth = 640; +static const int kMaxRow12 = kMaxOutputWidth * 2; + +static void ScaleRowDown8_C(const uint8* iptr, int32, + uint8* dst, int32 owidth) { + for (int x = 0; x < owidth; ++x) { + *dst++ = *iptr; + iptr += 8; + } +} + +// Note calling code checks width is less than max and if not +// uses ScaleRowDown8_C instead. +static void ScaleRowDown8Int_C(const uint8* iptr, int32 istride, + uint8* dst, int32 owidth) { + ALIGN16(uint8 irow[kMaxRow12 * 2]); + ASSERT(owidth <= kMaxOutputWidth); + ScaleRowDown4Int_C(iptr, istride, irow, owidth * 2); + ScaleRowDown4Int_C(iptr + istride * 4, istride, irow + kMaxOutputWidth, + owidth * 2); + ScaleRowDown2Int_C(irow, kMaxOutputWidth, dst, owidth); +} + +static void ScaleRowDown34_C(const uint8* iptr, int32, + uint8* dst, int32 owidth) { + ASSERT((owidth % 3 == 0) && (owidth > 0)); + uint8* dend = dst + owidth; + do { + dst[0] = iptr[0]; + dst[1] = iptr[1]; + dst[2] = iptr[3]; + dst += 3; + iptr += 4; + } while (dst < dend); +} + +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_C(const uint8* iptr, int32 istride, + uint8* d, int32 owidth) { + ASSERT((owidth % 3 == 0) && (owidth > 0)); + uint8* dend = d + owidth; + const uint8* s = iptr; + const uint8* t = iptr + istride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_C(const uint8* iptr, int32 istride, + uint8* d, int32 owidth) { + ASSERT((owidth % 3 == 0) && (owidth > 0)); + uint8* dend = d + owidth; + const uint8* s = iptr; + const uint8* t = iptr + istride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +#if defined(HAS_SCALEFILTERROWS_SSE2) +// Filter row to 3/4 +static void ScaleFilterCols34_C(uint8* optr, const uint8* iptr, int owidth) { + ASSERT((owidth % 3 == 0) && (owidth > 0)); + uint8* dend = optr + owidth; + const uint8* s = iptr; + do { + optr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; + optr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; + optr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; + optr += 3; + s += 4; + } while (optr < dend); +} +#endif + +static void ScaleFilterCols_C(uint8* optr, const uint8* iptr, + int owidth, int dx) { + int x = 0; + for (int j = 0; j < owidth; ++j) { + int xi = x >> 16; + int xf1 = x & 0xffff; + int xf0 = 65536 - xf1; + + *optr++ = (iptr[xi] * xf0 + iptr[xi + 1] * xf1) >> 16; + x += dx; + } +} + +#ifdef TEST_RSTSC +uint64 timers34[4] = { 0, }; +#endif + +static const int kMaxInputWidth = 2560; +#if defined(HAS_SCALEFILTERROWS_SSE2) +#define HAS_SCALEROWDOWN34_SSE2 +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_SSE2(const uint8* iptr, int32 istride, + uint8* d, int32 owidth) { + ASSERT((owidth % 3 == 0) && (owidth > 0)); + ALIGN16(uint8 row[kMaxInputWidth]); +#ifdef TEST_RSTSC + 
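+  // Read the cycle counter so the row pass and the column pass can be timed separately.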
uint64 t1 = __rdtsc(); +#endif + ScaleFilterRows_SSE2(row, iptr, istride, owidth * 4 / 3, 256 / 4); +#ifdef TEST_RSTSC + uint64 t2 = __rdtsc(); +#endif + ScaleFilterCols34_C(d, row, owidth); + +#ifdef TEST_RSTSC + uint64 t3 = __rdtsc(); + timers34[0] += t2 - t1; + timers34[1] += t3 - t2; +#endif +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_SSE2(const uint8* iptr, int32 istride, + uint8* d, int32 owidth) { + ASSERT((owidth % 3 == 0) && (owidth > 0)); + ALIGN16(uint8 row[kMaxInputWidth]); +#ifdef TEST_RSTSC + uint64 t1 = __rdtsc(); +#endif + ScaleFilterRows_SSE2(row, iptr, istride, owidth * 4 / 3, 256 / 2); +#ifdef TEST_RSTSC + uint64 t2 = __rdtsc(); +#endif + ScaleFilterCols34_C(d, row, owidth); +#ifdef TEST_RSTSC + uint64 t3 = __rdtsc(); + timers34[2] += t2 - t1; + timers34[3] += t3 - t2; +#endif +} +#endif + +static void ScaleRowDown38_C(const uint8* iptr, int32, + uint8* dst, int32 owidth) { + ASSERT(owidth % 3 == 0); + for (int x = 0; x < owidth; x += 3) { + dst[0] = iptr[0]; + dst[1] = iptr[3]; + dst[2] = iptr[6]; + dst += 3; + iptr += 8; + } +} + +// 8x3 -> 3x1 +static void ScaleRowDown38_3_Int_C(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth) { + ASSERT((owidth % 3 == 0) && (owidth > 0)); + for (int i = 0; i < owidth; i+=3) { + optr[0] = (iptr[0] + iptr[1] + iptr[2] + + iptr[istride + 0] + iptr[istride + 1] + iptr[istride + 2] + + iptr[istride * 2 + 0] + iptr[istride * 2 + 1] + iptr[istride * 2 + 2]) * + (65536 / 9) >> 16; + optr[1] = (iptr[3] + iptr[4] + iptr[5] + + iptr[istride + 3] + iptr[istride + 4] + iptr[istride + 5] + + iptr[istride * 2 + 3] + iptr[istride * 2 + 4] + iptr[istride * 2 + 5]) * + (65536 / 9) >> 16; + optr[2] = (iptr[6] + iptr[7] + + iptr[istride + 6] + iptr[istride + 7] + + iptr[istride * 2 + 6] + iptr[istride * 2 + 7]) * + (65536 / 6) >> 16; + iptr += 8; + optr += 3; + } +} + +// 8x2 -> 3x1 +static void ScaleRowDown38_2_Int_C(const uint8* iptr, int32 istride, + uint8* optr, int32 owidth) { + ASSERT((owidth % 3 == 0) && (owidth > 0)); + for (int i = 0; i < owidth; i+=3) { + optr[0] = (iptr[0] + iptr[1] + iptr[2] + + iptr[istride + 0] + iptr[istride + 1] + iptr[istride + 2]) * + (65536 / 6) >> 16; + optr[1] = (iptr[3] + iptr[4] + iptr[5] + + iptr[istride + 3] + iptr[istride + 4] + iptr[istride + 5]) * + (65536 / 6) >> 16; + optr[2] = (iptr[6] + iptr[7] + + iptr[istride + 6] + iptr[istride + 7]) * + (65536 / 4) >> 16; + iptr += 8; + optr += 3; + } +} + +// C version 8x2 -> 8x1 +static void ScaleFilterRows_C(uint8* optr, + const uint8* iptr0, int32 istride, + int owidth, int source_y_fraction) { + ASSERT(owidth > 0); + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* iptr1 = iptr0 + istride; + uint8* end = optr + owidth; + do { + optr[0] = (iptr0[0] * y0_fraction + iptr1[0] * y1_fraction) >> 8; + optr[1] = (iptr0[1] * y0_fraction + iptr1[1] * y1_fraction) >> 8; + optr[2] = (iptr0[2] * y0_fraction + iptr1[2] * y1_fraction) >> 8; + optr[3] = (iptr0[3] * y0_fraction + iptr1[3] * y1_fraction) >> 8; + optr[4] = (iptr0[4] * y0_fraction + iptr1[4] * y1_fraction) >> 8; + optr[5] = (iptr0[5] * y0_fraction + iptr1[5] * y1_fraction) >> 8; + optr[6] = (iptr0[6] * y0_fraction + iptr1[6] * y1_fraction) >> 8; + optr[7] = (iptr0[7] * y0_fraction + iptr1[7] * y1_fraction) >> 8; + iptr0 += 8; + iptr1 += 8; + optr += 8; + } while (optr < end); + optr[0] = optr[-1]; +} + +void ScaleAddRows_C(const uint8* iptr, int32 istride, + uint16* orow, int32 iwidth, int32 iheight) { + ASSERT(iwidth > 0); + 
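+  // Sums each column over iheight rows into 16-bit totals; callers divide by the box area.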
ASSERT(iheight > 0); + for (int x = 0; x < iwidth; ++x) { + const uint8* s = iptr + x; + int sum = 0; + for (int y = 0; y < iheight; ++y) { + sum += s[0]; + s += istride; + } + orow[x] = sum; + } +} + +/** + * Scale plane, 1/2 + * + * This is an optimized version for scaling down a plane to 1/2 of + * its original size. + * + */ +static void ScalePlaneDown2(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8 *iptr, uint8 *optr, + bool interpolate) { + ASSERT(iwidth % 2 == 0); + ASSERT(iheight % 2 == 0); + void (*ScaleRowDown2)(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasNEON) && + (owidth % 16 == 0) && (istride % 16 == 0) && (ostride % 16 == 0) && + IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) { + ScaleRowDown2 = interpolate ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN2_SSE2) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) && + (owidth % 16 == 0) && IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) { + ScaleRowDown2 = interpolate ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; + } else +#endif + { + ScaleRowDown2 = interpolate ? ScaleRowDown2Int_C : ScaleRowDown2_C; + } + + for (int y = 0; y < oheight; ++y) { + ScaleRowDown2(iptr, istride, optr, owidth); + iptr += (istride << 1); + optr += ostride; + } +} + +/** + * Scale plane, 1/4 + * + * This is an optimized version for scaling down a plane to 1/4 of + * its original size. + */ +static void ScalePlaneDown4(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8 *iptr, uint8 *optr, + bool interpolate) { + ASSERT(iwidth % 4 == 0); + ASSERT(iheight % 4 == 0); + void (*ScaleRowDown4)(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) && + (owidth % 8 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) && + IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) { + ScaleRowDown4 = interpolate ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; + } else +#endif + { + ScaleRowDown4 = interpolate ? ScaleRowDown4Int_C : ScaleRowDown4_C; + } + + for (int y = 0; y < oheight; ++y) { + ScaleRowDown4(iptr, istride, optr, owidth); + iptr += (istride << 2); + optr += ostride; + } +} + +/** + * Scale plane, 1/8 + * + * This is an optimized version for scaling down a plane to 1/8 + * of its original size. + * + */ +static void ScalePlaneDown8(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8 *iptr, uint8 *optr, + bool interpolate) { + ASSERT(iwidth % 8 == 0); + ASSERT(iheight % 8 == 0); + void (*ScaleRowDown8)(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); +#if defined(HAS_SCALEROWDOWN8_SSE2) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) && + (owidth % 16 == 0) && owidth <= kMaxOutputWidth && + (istride % 16 == 0) && (ostride % 16 == 0) && + IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) { + ScaleRowDown8 = interpolate ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; + } else +#endif + { + ScaleRowDown8 = interpolate && (owidth <= kMaxOutputWidth) ? 
+ ScaleRowDown8Int_C : ScaleRowDown8_C; + } + for (int y = 0; y < oheight; ++y) { + ScaleRowDown8(iptr, istride, optr, owidth); + iptr += (istride << 3); + optr += ostride; + } +} + +/** + * Scale plane down, 3/4 + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ +static void ScalePlaneDown34(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8* iptr, uint8* optr, + bool interpolate) { + ASSERT(owidth % 3 == 0); + void (*ScaleRowDown34_0)(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + void (*ScaleRowDown34_1)(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSSE3) && + (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) && + IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) { + if (!interpolate) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; + } + } else +#endif +#if defined(HAS_SCALEROWDOWN34_SSE2) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) && + (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) && + IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8) && + interpolate) { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; + } else +#endif + { + if (!interpolate) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; + } + } + int irow = 0; + for (int y = 0; y < oheight; ++y) { + switch (irow) { + case 0: + ScaleRowDown34_0(iptr, istride, optr, owidth); + break; + + case 1: + ScaleRowDown34_1(iptr, istride, optr, owidth); + break; + + case 2: + ScaleRowDown34_0(iptr + istride, -istride, optr, owidth); + break; + } + ++irow; + iptr += istride; + optr += ostride; + if (irow >= 3) { + iptr += istride; + irow = 0; + } + } + +#ifdef TEST_RSTSC + std::cout << "Timer34_0 Row " << std::setw(9) << timers34[0] + << " Column " << std::setw(9) << timers34[1] + << " Timer34_1 Row " << std::setw(9) << timers34[2] + << " Column " << std::setw(9) << timers34[3] << std::endl; +#endif +} + +/** + * Scale plane, 3/8 + * + * This is an optimized version for scaling down a plane to 3/8 + * of its original size. 
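+ * (e.g. 640x480 -> 240x180).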
+ * + * Reduces 16x3 to 6x1 + */ +static void ScalePlaneDown38(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8* iptr, uint8* optr, + bool interpolate) { + ASSERT(owidth % 3 == 0); + void (*ScaleRowDown38_3)(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); + void (*ScaleRowDown38_2)(const uint8* iptr, int32 istride, + uint8* orow, int32 owidth); +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSSE3) && + (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) && + IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) { + if (!interpolate) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; + } + } else +#endif + { + if (!interpolate) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; + } + } + int irow = 0; + for (int y = 0; y < oheight; ++y) { + switch (irow) { + case 0: + case 1: + ScaleRowDown38_3(iptr, istride, optr, owidth); + iptr += istride * 3; + ++irow; + break; + + case 2: + ScaleRowDown38_2(iptr, istride, optr, owidth); + iptr += istride * 2; + irow = 0; + break; + } + optr += ostride; + } +} + +inline static uint32 SumBox(int32 iboxwidth, int32 iboxheight, + int32 istride, const uint8 *iptr) { + ASSERT(iboxwidth > 0); + ASSERT(iboxheight > 0); + uint32 sum = 0u; + for (int y = 0; y < iboxheight; ++y) { + for (int x = 0; x < iboxwidth; ++x) { + sum += iptr[x]; + } + iptr += istride; + } + return sum; +} + +static void ScalePlaneBoxRow(int32 owidth, int32 boxheight, + int dx, int32 istride, + const uint8 *iptr, uint8 *optr) { + int x = 0; + for (int i = 0; i < owidth; ++i) { + int ix = x >> 16; + x += dx; + int boxwidth = (x >> 16) - ix; + *optr++ = SumBox(boxwidth, boxheight, istride, iptr + ix) / + (boxwidth * boxheight); + } +} + +inline static uint32 SumPixels(int32 iboxwidth, const uint16 *iptr) { + ASSERT(iboxwidth > 0); + uint32 sum = 0u; + for (int x = 0; x < iboxwidth; ++x) { + sum += iptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int32 owidth, int32 boxheight, int dx, + const uint16 *iptr, uint8 *optr) { + int scaletbl[2]; + int minboxwidth = (dx >> 16); + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + int *scaleptr = scaletbl - minboxwidth; + int x = 0; + for (int i = 0; i < owidth; ++i) { + int ix = x >> 16; + x += dx; + int boxwidth = (x >> 16) - ix; + *optr++ = SumPixels(boxwidth, iptr + ix) * scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols1_C(int32 owidth, int32 boxheight, int dx, + const uint16 *iptr, uint8 *optr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int x = 0; + for (int i = 0; i < owidth; ++i) { + *optr++ = SumPixels(boxwidth, iptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +/** + * Scale plane down to any dimensions, with interpolation. + * (boxfilter). + * + * Same method as SimpleScale, which is fixed point, outputting + * one pixel of destination using fixed point (16.16) to step + * through source, sampling a box of pixel with simple + * averaging. 
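+ * For example, 640 -> 240 gives dx = (640 << 16) / 240, so successive output
+ * pixels average boxes 2 or 3 input pixels wide; ScaleAddCols2_C handles both.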
+ */ +static void ScalePlaneBox(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8 *iptr, uint8 *optr) { + ASSERT(owidth > 0); + ASSERT(oheight > 0); + int dy = (iheight << 16) / oheight; + int dx = (iwidth << 16) / owidth; + if ((iwidth % 16 != 0) || (iwidth > kMaxInputWidth) || + oheight * 2 > iheight) { + uint8 *dst = optr; + int dy = (iheight << 16) / oheight; + int dx = (iwidth << 16) / owidth; + int y = 0; + for (int j = 0; j < oheight; ++j) { + int iy = y >> 16; + const uint8 *const src = iptr + iy * istride; + y += dy; + if (y > (iheight << 16)) { + y = (iheight << 16); + } + int boxheight = (y >> 16) - iy; + ScalePlaneBoxRow(owidth, boxheight, + dx, istride, + src, dst); + + dst += ostride; + } + } else { + ALIGN16(uint16 row[kMaxInputWidth]); + void (*ScaleAddRows)(const uint8* iptr, int32 istride, + uint16* orow, int32 iwidth, int32 iheight); + void (*ScaleAddCols)(int32 owidth, int32 boxheight, int dx, + const uint16 *iptr, uint8 *optr); +#if defined(HAS_SCALEADDROWS_SSE2) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) && + (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) { + ScaleAddRows = ScaleAddRows_SSE2; + } else +#endif + { + ScaleAddRows = ScaleAddRows_C; + } + if (dx & 0xffff) { + ScaleAddCols = ScaleAddCols2_C; + } else { + ScaleAddCols = ScaleAddCols1_C; + } + + int y = 0; + for (int j = 0; j < oheight; ++j) { + int iy = y >> 16; + const uint8 *const src = iptr + iy * istride; + y += dy; + if (y > (iheight << 16)) { + y = (iheight << 16); + } + int boxheight = (y >> 16) - iy; + ScaleAddRows(src, istride, row, iwidth, boxheight); + ScaleAddCols(owidth, boxheight, dx, row, optr); + optr += ostride; + } + } +} + +/** + * Scale plane to/from any dimensions, with interpolation. + */ +static void ScalePlaneBilinearSimple(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8 *iptr, uint8 *optr) { + uint8 *dst = optr; + int dx = (iwidth << 16) / owidth; + int dy = (iheight << 16) / oheight; + int maxx = ((iwidth - 1) << 16) - 1; + int maxy = ((iheight - 1) << 16) - 1; + int y = (oheight < iheight) ? 32768 : (iheight << 16) / oheight - 32768; + for (int i = 0; i < oheight; ++i) { + int cy = (y < 0) ? 0 : y; + int yi = cy >> 16; + int yf = cy & 0xffff; + const uint8 *const src = iptr + yi * istride; + int x = (owidth < iwidth) ? 32768 : (iwidth << 16) / owidth - 32768; + for (int j = 0; j < owidth; ++j) { + int cx = (x < 0) ? 0 : x; + int xi = cx >> 16; + int xf = cx & 0xffff; + int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; + int r1 = (src[xi + istride] * (65536 - xf) + src[xi + istride + 1] * xf) + >> 16; + *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; + x += dx; + if (x > maxx) + x = maxx; + } + dst += ostride - owidth; + y += dy; + if (y > maxy) + y = maxy; + } +} + +/** + * Scale plane to/from any dimensions, with bilinear + * interpolation. 
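+ * Each output row is produced in two passes: ScaleFilterRows blends two source
+ * rows into a temporary row, then ScaleFilterCols resamples it horizontally.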
+ */ +static void ScalePlaneBilinear(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8 *iptr, uint8 *optr) { + ASSERT(owidth > 0); + ASSERT(oheight > 0); + int dy = (iheight << 16) / oheight; + int dx = (iwidth << 16) / owidth; + if ((iwidth % 8 != 0) || (iwidth > kMaxInputWidth)) { + ScalePlaneBilinearSimple(iwidth, iheight, owidth, oheight, istride, ostride, + iptr, optr); + + } else { + ALIGN16(uint8 row[kMaxInputWidth + 1]); + void (*ScaleFilterRows)(uint8* optr, const uint8* iptr0, int32 istride, + int owidth, int source_y_fraction); + void (*ScaleFilterCols)(uint8* optr, const uint8* iptr, + int owidth, int dx); +#if defined(HAS_SCALEFILTERROWS_SSSE3) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSSE3) && + (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) { + ScaleFilterRows = ScaleFilterRows_SSSE3; + } else +#endif +#if defined(HAS_SCALEFILTERROWS_SSE2) + if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) && + (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) { + ScaleFilterRows = ScaleFilterRows_SSE2; + } else +#endif + { + ScaleFilterRows = ScaleFilterRows_C; + } + ScaleFilterCols = ScaleFilterCols_C; + + int y = 0; + int maxy = ((iheight - 1) << 16) - 1; // max is filter of last 2 rows. + for (int j = 0; j < oheight; ++j) { + int iy = y >> 16; + int fy = (y >> 8) & 255; + const uint8 *const src = iptr + iy * istride; + ScaleFilterRows(row, src, istride, iwidth, fy); + ScaleFilterCols(optr, row, owidth, dx); + optr += ostride; + y += dy; + if (y > maxy) { + y = maxy; + } + } + } +} + +/** + * Scale plane to/from any dimensions, without interpolation. + * Fixed point math is used for performance: The upper 16 bits + * of x and dx is the integer part of the source position and + * the lower 16 bits are the fixed decimal part. + */ +static void ScalePlaneSimple(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8 *iptr, uint8 *optr) { + uint8 *dst = optr; + int dx = (iwidth << 16) / owidth; + for (int y = 0; y < oheight; ++y) { + const uint8 *const src = iptr + (y * iheight / oheight) * istride; + // TODO(fbarchard): Round X coordinate by setting x=0x8000. + int x = 0; + for (int i = 0; i < owidth; ++i) { + *dst++ = src[x >> 16]; + x += dx; + } + dst += ostride - owidth; + } +} + +/** + * Scale plane to/from any dimensions. + */ +static void ScalePlaneAnySize(int32 iwidth, int32 iheight, + int32 owidth, int32 oheight, + int32 istride, int32 ostride, + const uint8 *iptr, uint8 *optr, + bool interpolate) { + if (!interpolate) { + ScalePlaneSimple(iwidth, iheight, owidth, oheight, istride, ostride, + iptr, optr); + } else { + // fall back to non-optimized version + ScalePlaneBilinear(iwidth, iheight, owidth, oheight, istride, ostride, + iptr, optr); + } +} + +/** + * Scale plane down, any size + * + * This is an optimized version for scaling down a plane to any size. + * The current implementation is ~10 times faster compared to the + * reference implementation for e.g. 
XGA->LowResPAL
+ *
+ */
+static void ScalePlaneDown(int32 iwidth, int32 iheight,
+                           int32 owidth, int32 oheight,
+                           int32 istride, int32 ostride,
+                           const uint8 *iptr, uint8 *optr,
+                           bool interpolate) {
+  if (!interpolate) {
+    ScalePlaneSimple(iwidth, iheight, owidth, oheight, istride, ostride,
+                     iptr, optr);
+  } else if (oheight * 2 > iheight) {  // between 1/2x and 1x use bilinear
+    ScalePlaneBilinear(iwidth, iheight, owidth, oheight, istride, ostride,
+                       iptr, optr);
+  } else {
+    ScalePlaneBox(iwidth, iheight, owidth, oheight, istride, ostride,
+                  iptr, optr);
+  }
+}
+
+/**
+ * Copy plane, no scaling
+ *
+ * This simply copies the given plane without scaling.
+ * The current implementation is ~115 times faster
+ * than the reference implementation.
+ *
+ */
+static void CopyPlane(int32 iwidth, int32 iheight,
+                      int32 owidth, int32 oheight,
+                      int32 istride, int32 ostride,
+                      const uint8 *iptr, uint8 *optr) {
+  if (istride == iwidth && ostride == owidth) {
+    // All contiguous, so can use REALLY fast path.
+    memcpy(optr, iptr, iwidth * iheight);
+  } else {
+    // Not all contiguous; must copy scanlines individually.
+    const uint8 *src = iptr;
+    uint8 *dst = optr;
+    for (int i = 0; i < iheight; ++i) {
+      memcpy(dst, src, iwidth);
+      dst += ostride;
+      src += istride;
+    }
+  }
+}
+
+static void ScalePlane(const uint8 *in, int32 istride,
+                       int32 iwidth, int32 iheight,
+                       uint8 *out, int32 ostride,
+                       int32 owidth, int32 oheight,
+                       bool interpolate, bool use_ref) {
+  // Use specialized scales to improve performance for common resolutions.
+  // For example, all the 1/2 scalings will use ScalePlaneDown2().
+  if (owidth == iwidth && oheight == iheight) {
+    // Straight copy.
+    CopyPlane(iwidth, iheight, owidth, oheight, istride, ostride, in, out);
+  } else if (owidth <= iwidth && oheight <= iheight) {
+    // Scale down.
+    if (use_ref) {
+      // For testing, allow the optimized versions to be disabled.
+      ScalePlaneDown(iwidth, iheight, owidth, oheight, istride, ostride,
+                     in, out, interpolate);
+    } else if (4 * owidth == 3 * iwidth && 4 * oheight == 3 * iheight) {
+      // optimized, 3/4
+      ScalePlaneDown34(iwidth, iheight, owidth, oheight, istride, ostride,
+                       in, out, interpolate);
+    } else if (2 * owidth == iwidth && 2 * oheight == iheight) {
+      // optimized, 1/2
+      ScalePlaneDown2(iwidth, iheight, owidth, oheight, istride, ostride,
+                      in, out, interpolate);
+      // 3/8 rounded up for odd sized chroma height.
+    } else if (8 * owidth == 3 * iwidth && oheight == ((iheight * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38(iwidth, iheight, owidth, oheight, istride, ostride,
+                       in, out, interpolate);
+    } else if (4 * owidth == iwidth && 4 * oheight == iheight) {
+      // optimized, 1/4
+      ScalePlaneDown4(iwidth, iheight, owidth, oheight, istride, ostride,
+                      in, out, interpolate);
+    } else if (8 * owidth == iwidth && 8 * oheight == iheight) {
+      // optimized, 1/8
+      ScalePlaneDown8(iwidth, iheight, owidth, oheight, istride, ostride,
+                      in, out, interpolate);
+    } else {
+      // Arbitrary downsample
+      ScalePlaneDown(iwidth, iheight, owidth, oheight, istride, ostride,
+                     in, out, interpolate);
+    }
+  } else {
+    // Arbitrary scale up and/or down.
+    ScalePlaneAnySize(iwidth, iheight, owidth, oheight, istride, ostride,
+                      in, out, interpolate);
+  }
+}
+
+/**
+ * Scale a plane.
+ *
+ * This function in turn calls a scaling function
+ * suitable for handling the desired resolutions.
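+ * The U and V planes are scaled at half the luma dimensions (4:2:0 layout).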
+ * + */ +bool YuvScaler::Scale(const uint8 *inY, const uint8 *inU, const uint8 *inV, + int32 istrideY, int32 istrideU, int32 istrideV, + int32 iwidth, int32 iheight, + uint8 *outY, uint8 *outU, uint8 *outV, + int32 ostrideY, int32 ostrideU, int32 ostrideV, + int32 owidth, int32 oheight, + bool interpolate) { + if (!inY || !inU || !inV || iwidth <= 0 || iheight <= 0 || + !outY || !outU || !outV || owidth <= 0 || oheight <= 0) { + return false; + } + int32 halfiwidth = (iwidth + 1) >> 1; + int32 halfiheight = (iheight + 1) >> 1; + int32 halfowidth = (owidth + 1) >> 1; + int32 halfoheight = (oheight + 1) >> 1; + + ScalePlane(inY, istrideY, iwidth, iheight, + outY, ostrideY, owidth, oheight, + interpolate, use_reference_impl_); + ScalePlane(inU, istrideU, halfiwidth, halfiheight, + outU, ostrideU, halfowidth, halfoheight, + interpolate, use_reference_impl_); + ScalePlane(inV, istrideV, halfiwidth, halfiheight, + outV, ostrideV, halfowidth, halfoheight, + interpolate, use_reference_impl_); + return true; +} + +bool YuvScaler::Scale(const uint8 *in, int32 iwidth, int32 iheight, + uint8 *out, int32 owidth, int32 oheight, int32 ooffset, + bool interpolate) { + if (!in || iwidth <= 0 || iheight <= 0 || + !out || owidth <= 0 || oheight <= 0 || ooffset < 0 || + ooffset >= oheight) { + return false; + } + ooffset = ooffset & ~1; // chroma requires offset to multiple of 2. + int32 halfiwidth = (iwidth + 1) >> 1; + int32 halfiheight = (iheight + 1) >> 1; + int32 halfowidth = (owidth + 1) >> 1; + int32 halfoheight = (oheight + 1) >> 1; + int32 aheight = oheight - ooffset * 2; // actual output height + const uint8 *const iyptr = in; + uint8 *oyptr = out + ooffset * owidth; + const uint8 *const iuptr = in + iwidth * iheight; + uint8 *ouptr = out + owidth * oheight + (ooffset >> 1) * halfowidth; + const uint8 *const ivptr = in + iwidth * iheight + + halfiwidth * halfiheight; + uint8 *ovptr = out + owidth * oheight + halfowidth * halfoheight + + (ooffset >> 1) * halfowidth; + return Scale(iyptr, iuptr, ivptr, iwidth, halfiwidth, halfiwidth, + iwidth, iheight, oyptr, ouptr, ovptr, owidth, + halfowidth, halfowidth, owidth, aheight, interpolate); +} + +} // namespace libyuv diff --git a/source/video_common.cc b/source/video_common.cc new file mode 100644 index 000000000..5152781e1 --- /dev/null +++ b/source/video_common.cc @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "video_common.h" + +#include + +#include "common.h" + +namespace libyuv { + +struct FourCCAliasEntry { + uint32 alias; + uint32 canonical; +}; + +static const FourCCAliasEntry kFourCCAliases[] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, + {FOURCC_BA81, FOURCC_BGGR}, + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. 
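+  // RGB3/BGR3 are the V4L2 fourccs for 24-bit RGB/BGR.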
+ {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, +}; + +uint32 CanonicalFourCC(uint32 fourcc) { + for (int i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +std::string VideoFormat::ToString() const { + std::string fourcc_name = GetFourccName(fourcc) + " "; + for (std::string::const_iterator i = fourcc_name.begin(); + i < fourcc_name.end(); ++i) { + // Test character is printable; Avoid isprint() which asserts on negatives + if (*i < 32 || *i >= 127) { + fourcc_name = ""; + break; + } + } + + std::ostringstream ss; + ss << fourcc_name << width << "x" << height << "x" << IntervalToFps(interval); + return ss.str(); +} + +} // namespace libyuv diff --git a/source/video_common.h b/source/video_common.h new file mode 100644 index 000000000..c936c4cfc --- /dev/null +++ b/source/video_common.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* +* Common definitions for video, including fourcc and VideoFormat +*/ + + +#ifndef LIBYUV_SOURCE_VIDEO_COMMON_H_ +#define LIBYUV_SOURCE_VIDEO_COMMON_H_ + +#include + +#include "basic_types.h" + +namespace libyuv { + +////////////////////////////////////////////////////////////////////////////// +// Definition of fourcc. +////////////////////////////////////////////////////////////////////////////// +// Convert four characters to a fourcc code. +// Needs to be a macro otherwise the OS X compiler complains when the kFormat* +// constants are used in a switch. +#define FOURCC(a, b, c, d) (\ + (static_cast(a)) | (static_cast(b) << 8) | \ + (static_cast(c) << 16) | (static_cast(d) << 24)) + +// Get the name, that is, string with four characters, of a fourcc code. +inline std::string GetFourccName(uint32 fourcc) { + std::string name; + name.push_back(static_cast(fourcc & 0xFF)); + name.push_back(static_cast((fourcc >> 8) & 0xFF)); + name.push_back(static_cast((fourcc >> 16) & 0xFF)); + name.push_back(static_cast((fourcc >> 24) & 0xFF)); + return name; +} + +// Some good pages discussing FourCC codes: +// http://developer.apple.com/quicktime/icefloe/dispatch020.html +// http://www.fourcc.org/yuv.php +enum FourCC { + // Canonical fourcc codes used in our code. + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), + FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + FOURCC_M420 = FOURCC('M', '4', '2', '0'), + FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), + FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), + // Next four are Bayer RGB formats. The four characters define the order of + // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom. 
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), + + // Aliases for canonical fourcc codes, replaced with their canonical + // equivalents by CanonicalFourCC(). + FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420 + FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Alias for I420 + FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2 + FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac + FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY + FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY + FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG + FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR + FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW + FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG + + // Match any fourcc. + FOURCC_ANY = 0xFFFFFFFF, +}; + +// Converts fourcc aliases into canonical ones. +uint32 CanonicalFourCC(uint32 fourcc); + +////////////////////////////////////////////////////////////////////////////// +// Definition of VideoFormat. +////////////////////////////////////////////////////////////////////////////// + +static const int64 kNumNanosecsPerSec = 1000000000; + +struct VideoFormat { + static const int64 kMinimumInterval = kNumNanosecsPerSec / 10000; // 10k fps + + VideoFormat() : width(0), height(0), interval(0), fourcc(0) {} + + VideoFormat(int w, int h, int64 interval_ns, uint32 cc) + : width(w), + height(h), + interval(interval_ns), + fourcc(cc) { + } + + VideoFormat(const VideoFormat& format) + : width(format.width), + height(format.height), + interval(format.interval), + fourcc(format.fourcc) { + } + + static int64 FpsToInterval(int fps) { + return fps ? kNumNanosecsPerSec / fps : kMinimumInterval; + } + + static int IntervalToFps(int64 interval) { + // Normalize the interval first. + interval = libyuv::_max(interval, kMinimumInterval); + return static_cast(kNumNanosecsPerSec / interval); + } + + bool operator==(const VideoFormat& format) const { + return width == format.width && height == format.height && + interval == format.interval && fourcc == format.fourcc; + } + + bool operator!=(const VideoFormat& format) const { + return !(*this == format); + } + + bool operator<(const VideoFormat& format) const { + return (fourcc < format.fourcc) || + (fourcc == format.fourcc && width < format.width) || + (fourcc == format.fourcc && width == format.width && + height < format.height) || + (fourcc == format.fourcc && width == format.width && + height == format.height && interval > format.interval); + } + + int framerate() const { return IntervalToFps(interval); } + + // Check if both width and height are 0. + bool IsSize0x0() const { return 0 == width && 0 == height; } + + // Check if this format is less than another one by comparing the resolution + // and frame rate. + bool IsPixelRateLess(const VideoFormat& format) const { + return width * height * framerate() < + format.width * format.height * format.framerate(); + } + + // Get a string presentation in the form of "fourcc width x height x fps" + std::string ToString() const; + + int width; // in number of pixels + int height; // in number of pixels + int64 interval; // in nanoseconds + uint32 fourcc; // color space. FOURCC_ANY means that any color space is OK. +}; + +// Result of video capturer start. +enum CaptureResult { + CR_SUCCESS, // The capturer starts successfully. 
+  CR_PENDING,    // The capturer is waiting for the capture device to start.
+  CR_FAILURE,    // The capturer fails to start.
+  CR_NO_DEVICE,  // The capturer has no device and fails to start.
+};
+
+}  // namespace libyuv
+
+#endif  // LIBYUV_SOURCE_VIDEO_COMMON_H_