diff --git a/common/common.h b/common/common.h deleted file mode 100644 index 79e9af616..000000000 --- a/common/common.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef LIBYUV_SOURCE_COMMON_H_ -#define LIBYUV_SOURCE_COMMON_H_ - -#if defined(_MSC_VER) -// warning C4355: 'this' : used in base member initializer list -#pragma warning(disable:4355) -#endif - -#ifndef ENABLE_DEBUG -#define ENABLE_DEBUG _DEBUG -#endif // !defined(ENABLE_DEBUG) - -#if ENABLE_DEBUG - -#if defined(_MSC_VER) && _MSC_VER < 1300 -#define __FUNCTION__ "" -#endif -#else // !ENABLE_DEBUG - -#endif // !ENABLE_DEBUG - -// Forces compiler to inline, even against its better judgement. Use wisely. -#if defined(__GNUC__) -#define FORCE_INLINE __attribute__((always_inline)) -#elif defined(WIN32) -#define FORCE_INLINE __forceinline -#else -#define FORCE_INLINE -#endif - -#endif // LIBYUV_SOURCE_COMMON_H_ diff --git a/include/format_conversion.h b/include/format_conversion.h deleted file mode 100644 index e73d4e50a..000000000 --- a/include/format_conversion.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef LIBYUV_INCLUDE_FORMATCONVERSION_H_ -#define LIBYUV_INCLUDE_FORMATCONVERSION_H_ - -#include "basic_types.h" - -namespace libyuv { - -// Converts any Bayer RGB format to I420. -void BayerRGBToI420(const uint8* src_bayer, int src_pitch_bayer, - uint32 src_fourcc_bayer, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height); - -// Converts any Bayer RGB format to ARGB. -void BayerRGBToARGB(const uint8* src_bayer, int src_pitch_bayer, - uint32 src_fourcc_bayer, - uint8* dst_rgb, int dst_pitch_rgb, - int width, int height); - -// Converts ARGB to any Bayer RGB format. -void ARGBToBayerRGB(const uint8* src_rgb, int src_pitch_rgb, - uint8* dst_bayer, int dst_pitch_bayer, - uint32 dst_fourcc_bayer, - int width, int height); - -} // namespace libyuv - -#endif // LIBYUV_INCLUDE_FORMATCONVERSION_H_ diff --git a/include/libyuv.h b/include/libyuv.h new file mode 100644 index 000000000..81af8c427 --- /dev/null +++ b/include/libyuv.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef LIBYUV_INCLUDE_LIBYUV_H_ +#define LIBYUV_INCLUDE_LIBYUV_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/general.h" +#include "libyuv/planar_functions.h" +#include "libyuv/scale.h" + +#endif // LIBYUV_INCLUDE_LIBYUV_H_ diff --git a/common/basic_types.h b/include/libyuv/basic_types.h similarity index 51% rename from common/basic_types.h rename to include/libyuv/basic_types.h index a553a3961..5adc2bfdb 100644 --- a/common/basic_types.h +++ b/include/libyuv/basic_types.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef LIBYUV_COMMON_BASIC_TYPES_H_ -#define LIBYUV_COMMON_BASIC_TYPES_H_ +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ #include // for NULL, size_t @@ -17,11 +17,6 @@ #include // for uintptr_t #endif -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - - #ifndef INT_TYPES_DEFINED #define INT_TYPES_DEFINED #ifdef COMPILER_MSVC @@ -59,48 +54,15 @@ typedef unsigned short uint16; typedef unsigned char uint8; #endif // INT_TYPES_DEFINED -#ifdef WIN32 -typedef int socklen_t; -#endif - -namespace libyuv { - template inline T _min(T a, T b) { return (a > b) ? b : a; } - template inline T _max(T a, T b) { return (a < b) ? b : a; } - - // For wait functions that take a number of milliseconds, kForever indicates - // unlimited time. - const int kForever = -1; -} - // Detect compiler is for x86 or x64. 
#if defined(__x86_64__) || defined(_M_X64) || \ defined(__i386__) || defined(_M_IX86) #define CPU_X86 1 #endif -#ifdef WIN32 -#define alignof(t) __alignof(t) -#else // !WIN32 -#define alignof(t) __alignof__(t) -#endif // !WIN32 #define IS_ALIGNED(p, a) (0==(reinterpret_cast(p) & ((a)-1))) #define ALIGNP(p, t) \ (reinterpret_cast(((reinterpret_cast(p) + \ ((t)-1)) & ~((t)-1)))) -#ifndef UNUSED -#define UNUSED(x) Unused(static_cast(&x)) -#define UNUSED2(x,y) Unused(static_cast(&x)); Unused(static_cast(&y)) -#define UNUSED3(x,y,z) Unused(static_cast(&x)); Unused(static_cast(&y)); Unused(static_cast(&z)) -#define UNUSED4(x,y,z,a) Unused(static_cast(&x)); Unused(static_cast(&y)); Unused(static_cast(&z)); Unused(static_cast(&a)) -#define UNUSED5(x,y,z,a,b) Unused(static_cast(&x)); Unused(static_cast(&y)); Unused(static_cast(&z)); Unused(static_cast(&a)); Unused(static_cast(&b)) -inline void Unused(const void *) { } -#endif // UNUSED - -#if defined(__GNUC__) -#define GCC_ATTR(x) __attribute__ ((x)) -#else // !__GNUC__ -#define GCC_ATTR(x) -#endif // !__GNUC__ - -#endif // LIBYUV_COMMON_BASIC_TYPES_H_ +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/include/convert.h b/include/libyuv/convert.h similarity index 96% rename from include/convert.h rename to include/libyuv/convert.h index 731f624cd..c08011ef5 100644 --- a/include/convert.h +++ b/include/libyuv/convert.h @@ -9,10 +9,10 @@ */ -#ifndef LIBYUV_INCLUDE_CONVERT_H_ -#define LIBYUV_INCLUDE_CONVERT_H_ +#ifndef INCLUDE_LIBYUV_CONVERT_H_ +#define INCLUDE_LIBYUV_CONVERT_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" namespace libyuv { @@ -106,4 +106,4 @@ NV12ToRGB565(const uint8* src_yplane, int src_ystride, } // namespace libyuv -#endif // LIBYUV_INCLUDE_CONVERT_H_ +#endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/source/cpu_id.h b/include/libyuv/cpu_id.h similarity index 88% rename from source/cpu_id.h rename to include/libyuv/cpu_id.h index ae33238ba..efe17e23e 100644 --- a/source/cpu_id.h +++ 
b/include/libyuv/cpu_id.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef LIBYUV_SOURCE_CPU_ID_H_ -#define LIBYUV_SOURCE_CPU_ID_H_ +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#define INCLUDE_LIBYUV_CPU_ID_H_ namespace libyuv { @@ -28,4 +28,4 @@ void MaskCpuFlagsForTest(int enable_flags); } // namespace libyuv -#endif // LIBYUV_SOURCE_CPU_ID_H_ +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/include/libyuv/format_conversion.h b/include/libyuv/format_conversion.h new file mode 100644 index 000000000..d3d36f388 --- /dev/null +++ b/include/libyuv/format_conversion.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ +#define INCLUDE_LIBYUV_FORMATCONVERSION_H_ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +// Converts any Bayer RGB format to I420. +int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, + uint32 src_fourcc_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Converts any Bayer RGB format to ARGB. +int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer, + uint32 src_fourcc_bayer, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Converts ARGB to any Bayer RGB format. 
+int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb, + uint8* dst_bayer, int dst_stride_bayer, + uint32 dst_fourcc_bayer, + int width, int height); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ diff --git a/include/general.h b/include/libyuv/general.h similarity index 88% rename from include/general.h rename to include/libyuv/general.h index 9450e3782..3cd9d3234 100644 --- a/include/general.h +++ b/include/libyuv/general.h @@ -13,23 +13,22 @@ * General operations on YUV images. */ -#ifndef LIBYUV_INCLUDE_GENERAL_H_ -#define LIBYUV_INCLUDE_GENERAL_H_ +#ifndef INCLUDE_LIBYUV_GENERAL_H_ +#define INCLUDE_LIBYUV_GENERAL_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" namespace libyuv { // Supported rotation -enum RotationMode -{ +enum RotationMode { kRotateNone = 0, kRotateClockwise = 90, kRotateCounterClockwise = 270, kRotate180 = 180, }; -// I420 mirror +// I420 mirror int I420Mirror(const uint8* src_yplane, int src_ystride, const uint8* src_uplane, int src_ustride, @@ -45,7 +44,7 @@ I420CropPad(const uint8* src_frame, int src_width, int src_height, uint8* dst_frame, int dst_width, int dst_height); -// I420 Crop - make a center cut +// I420 Crop - crop a rectangle from image int I420Crop(uint8* frame, int src_width, int src_height, @@ -62,8 +61,6 @@ I420Rotate(const uint8* src_yplane, int src_ystride, int width, int height, RotationMode mode); - } // namespace libyuv - -#endif // LIBYUV_INCLUDE_GENERAL_H_ +#endif // INCLUDE_LIBYUV_GENERAL_H_ diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h new file mode 100644 index 000000000..8b8b25240 --- /dev/null +++ b/include/libyuv/planar_functions.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ +#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +// Copy I420 to I420. +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I422 to I420. Used by MJPG. +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert M420 to I420. +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert Q420 to I420. +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV12 to I420. Also used for NV21. +int NV12ToI420(const uint8* src_y, + const uint8* src_uv, int src_stride, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I420. +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I420. 
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I420 to ARGB. +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to BGRA. +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to ABGR. +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to ARGB. +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I444 to ARGB. +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 to ARGB. +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 to ARGB. +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert RAW to ARGB. +int RAWToARGB(const uint8* src_raw, int src_stride_raw, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert BG24 to ARGB. +int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ABGR to ARGB. 
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert BGRA to ARGB. +int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h new file mode 100644 index 000000000..8433908b9 --- /dev/null +++ b/include/libyuv/scale.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +// Supported filtering +enum FilterMode { + kFilterNone = 0, // Point sample; Fastest + kFilterBilinear = 1, // Faster than box, but lower quality scaling down. + kFilterBox = 2 // Highest quality +}; + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce even better +// quality image, at further expense of speed. +// Returns 0 if successful. 
+ +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering); + +// Legacy API +// If dst_height_offset is non-zero, the image is offset by that many pixels +// and stretched to (dst_height - dst_height_offset * 2) pixels high, +// instead of dst_height. +int Scale(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_height_offset, + bool interpolate); + +// Same, but specified in terms of each plane location and stride. +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + bool interpolate); + +// For testing, allow disabling of optimizations. +void SetUseReferenceImpl(bool use); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/include/planar_functions.h b/include/planar_functions.h deleted file mode 100644 index 1a5b48380..000000000 --- a/include/planar_functions.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_ -#define LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_ - -#include "basic_types.h" - -namespace libyuv { - -// Copy I420 to I420. 
-void I420Copy(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height); - -// Convert I422 to I420. Used by MJPG. -void I422ToI420(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height); - -// Convert M420 to I420. -void M420ToI420(const uint8* src_m420, int src_pitch_m420, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height); - -// Convert Q420 to I420. -void Q420ToI420(const uint8* src_y, int src_pitch_y, - const uint8* src_yuy2, int src_pitch_yuy2, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height); - -// Convert NV12 to I420. Also used for NV21. -void NV12ToI420(const uint8* src_y, - const uint8* src_uv, int src_pitch, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height); - -// Convert YUY2 to I420. -void YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height); - -// Convert UYVY to I420. -void UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height); - -// Convert I420 to ARGB. -void I420ToARGB(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -// Convert I420 to BGRA. 
-void I420ToBGRA(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -// Convert I420 to ABGR. -void I420ToABGR(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -// Convert I422 to ARGB. -void I422ToARGB(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -// Convert I444 to ARGB. -void I444ToARGB(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -// Convert I400 to ARGB. -void I400ToARGB(const uint8* src_y, int src_pitch_y, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -// Convert I400 to ARGB. -void I400ToARGB_Reference(const uint8* src_y, int src_pitch_y, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -// Convert RAW to ARGB. -void RAWToARGB(const uint8* src_raw, int src_pitch_raw, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -// Convert BG24 to ARGB. -void BG24ToARGB(const uint8* src_bg24, int src_pitch_bg24, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -// Convert ABGR to ARGB. -void ABGRToARGB(const uint8* src_abgr, int src_pitch_abgr, - uint8* dst_argb, int dst_pitch_argb, - int width, int height); - -} // namespace libyuv - -#endif // LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_ diff --git a/include/scale.h b/include/scale.h deleted file mode 100644 index 9cef9bce8..000000000 --- a/include/scale.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef LIBYUV_INCLUDE_SCALE_H_ -#define LIBYUV_INCLUDE_SCALE_H_ - -#include "basic_types.h" - -#if defined(_MSC_VER) -#define ALIGN16(var) __declspec(align(16)) var -#else -#define ALIGN16(var) var __attribute__((aligned(16))) -#endif - -namespace libyuv { - -// Scales a YUV 4:2:0 image from the input width and height to the -// output width and height. If outh_offset is nonzero, the image is -// offset by that many pixels and stretched to (outh - outh_offset * 2) -// pixels high, instead of outh. -// If interpolate is not set, a simple nearest-neighbor algorithm is -// used. This produces basic (blocky) quality at the fastest speed. -// If interpolate is set, interpolation is used to produce a better -// quality image, at the expense of speed. -// Returns true if successful. -bool Scale(const uint8 *in, int32 inw, int32 inh, - uint8 *out, int32 outw, int32 outh, int32 outh_offset, - bool interpolate); - -// Same, but specified in terms of each plane location and stride. -bool Scale(const uint8 *inY, const uint8 *inU, const uint8 *inV, - int32 istrideY, int32 istrideU, int32 istrideV, - int32 iwidth, int32 iheight, - uint8 *outY, uint8 *outU, uint8 *outV, - int32 ostrideY, int32 ostrideU, int32 ostrideV, - int32 owidth, int32 oheight, - bool interpolate); - -// For testing, allow disabling of optimizations. 
-void SetUseReferenceImpl(bool use); - -} // namespace libyuv - -#endif // LIBYUV_INCLUDE_SCALE_H_ diff --git a/source/conversion_tables.h b/source/conversion_tables.h index e778fa2d0..9a328649c 100644 --- a/source/conversion_tables.h +++ b/source/conversion_tables.h @@ -15,11 +15,11 @@ * ***************************************************************/ -#ifndef WEBRTC_COMMON_VIDEO_VPLIB_CONVERSION_TABLES -#define WEBRTC_COMMON_VIDEO_VPLIB_CONVERSION_TABLES +#ifndef LIBYUV_SOURCE_CONVERSION_TABLES_H_ +#define LIBYUV_SOURCE_CONVERSION_TABLES_H_ + +namespace libyuv { -namespace libyuv -{ /****************************************************************************** * YUV TO RGB approximation * @@ -97,7 +97,6 @@ namespace libyuv Ucg(244),Ucg(245),Ucg(246),Ucg(247),Ucg(248),Ucg(249),Ucg(250),Ucg(251), Ucg(252),Ucg(253),Ucg(254),Ucg(255)}; - static const int mapUcb[256] = { Ucb(0),Ucb(1),Ucb(2),Ucb(3),Ucb(4),Ucb(5),Ucb(6),Ucb(7),Ucb(8),Ucb(9), Ucb(10),Ucb(11),Ucb(12),Ucb(13),Ucb(14),Ucb(15),Ucb(16),Ucb(17),Ucb(18), @@ -199,5 +198,6 @@ namespace libyuv Vcg(252),Vcg(253),Vcg(254),Vcg(255)}; } // namespace libyuv + #endif diff --git a/source/convert.cc b/source/convert.cc index ddaa51f22..e555c5440 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -8,13 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "libyuv/convert.h" -#include "convert.h" -#include "basic_types.h" - -#include // memcpy(), memset() -#include -#include // abs +#include "libyuv/basic_types.h" +#include "conversion_tables.h" //#define SCALEOPT //Currently for windows only. 
June 2010 @@ -22,20 +19,16 @@ #include #endif -#include "conversion_tables.h" - -namespace libyuv -{ - - -// Clip value to [0,255] -inline uint8 Clip(int32 val); - -#ifdef SCALEOPT -void *memcpy_16(void * dest, const void * src, size_t n); -void *memcpy_8(void * dest, const void * src, size_t n); -#endif +namespace libyuv { +static inline uint8 Clip(int32 val) { + if (val < 0) { + return (uint8) 0; + } else if (val > 255){ + return (uint8) 255; + } + return (uint8) val; +} int I420ToRGB24(const uint8* src_yplane, int src_ystride, @@ -344,8 +337,8 @@ I420ToYUY2(const uint8* src_yplane, int src_ystride, const uint8* in1 = src_yplane; const uint8* in2 = src_yplane + src_ystride ; - const uint8* inU = src_uplane; - const uint8* inV = src_vplane; + const uint8* src_u = src_uplane; + const uint8* src_v = src_vplane; uint8* out1 = dst_frame; uint8* out2 = dst_frame + 2 * dst_stride; @@ -356,25 +349,25 @@ I420ToYUY2(const uint8* src_yplane, int src_ystride, for (int i = 0; i < ((src_height + 1) >> 1); i++){ for (int j = 0; j < ((src_width + 1) >> 1); j++){ out1[0] = in1[0]; - out1[1] = *inU; + out1[1] = *src_u; out1[2] = in1[1]; - out1[3] = *inV; + out1[3] = *src_v; out2[0] = in2[0]; - out2[1] = *inU; + out2[1] = *src_u; out2[2] = in2[1]; - out2[3] = *inV; + out2[3] = *src_v; out1 += 4; out2 += 4; - inU++; - inV++; + src_u++; + src_v++; in1 += 2; in2 += 2; } in1 += 2 * src_ystride - src_width; in2 += 2 * src_ystride - src_width; - inU += src_ustride - ((src_width + 1) >> 1); - inV += src_vstride - ((src_width + 1) >> 1); + src_u += src_ustride - ((src_width + 1) >> 1); + src_v += src_vstride - ((src_width + 1) >> 1); out1 += 2 * dst_stride + 2 * (dst_stride - src_width); out2 += 2 * dst_stride + 2 * (dst_stride - src_width); } @@ -387,34 +380,34 @@ I420ToYUY2(const uint8* src_yplane, int src_ystride, ;pusha mov eax, DWORD PTR [in1] ;1939.33 mov ecx, DWORD PTR [in2] ;1939.33 - mov ebx, DWORD PTR [inU] ;1939.33 - mov edx, DWORD PTR [inV] ;1939.33 + mov ebx, DWORD PTR [src_u] 
;1939.33 + mov edx, DWORD PTR [src_v] ;1939.33 loop0: - movq xmm6, QWORD PTR [ebx] ;inU - movq xmm0, QWORD PTR [edx] ;inV - punpcklbw xmm6, xmm0 ;inU, inV mix + movq xmm6, QWORD PTR [ebx] ;src_u + movq xmm0, QWORD PTR [edx] ;src_v + punpcklbw xmm6, xmm0 ;src_u, src_v mix ;movdqa xmm1, xmm6 ;movdqa xmm2, xmm6 ;movdqa xmm4, xmm6 movdqu xmm3, XMMWORD PTR [eax] ;in1 movdqa xmm1, xmm3 - punpcklbw xmm1, xmm6 ;in1, inU, in1, inV + punpcklbw xmm1, xmm6 ;in1, src_u, in1, src_v mov esi, DWORD PTR [out1] movdqu XMMWORD PTR [esi], xmm1 ;write to out1 movdqu xmm5, XMMWORD PTR [ecx] ;in2 movdqa xmm2, xmm5 - punpcklbw xmm2, xmm6 ;in2, inU, in2, inV + punpcklbw xmm2, xmm6 ;in2, src_u, in2, src_v mov edi, DWORD PTR [out2] movdqu XMMWORD PTR [edi], xmm2 ;write to out2 - punpckhbw xmm3, xmm6 ;in1, inU, in1, inV again + punpckhbw xmm3, xmm6 ;in1, src_u, in1, src_v again movdqu XMMWORD PTR [esi+16], xmm3 ;write to out1 again add esi, 32 mov DWORD PTR [out1], esi - punpckhbw xmm5, xmm6 ;inU, in2, inV again + punpckhbw xmm5, xmm6 ;src_u, in2, src_v again movdqu XMMWORD PTR [edi+16], xmm5 ;write to out2 again add edi, 32 mov DWORD PTR [out2], edi @@ -431,8 +424,8 @@ I420ToYUY2(const uint8* src_yplane, int src_ystride, mov DWORD PTR [in1], eax ;1939.33 mov DWORD PTR [in2], ecx ;1939.33 - mov DWORD PTR [inU], ebx ;1939.33 - mov DWORD PTR [inV], edx ;1939.33 + mov DWORD PTR [src_u], ebx ;1939.33 + mov DWORD PTR [src_v], edx ;1939.33 ;popa emms @@ -504,32 +497,32 @@ I420ToUYVY(const uint8* src_yplane, int src_ystride, ;pusha mov eax, DWORD PTR [in1] ;1939.33 mov ecx, DWORD PTR [in2] ;1939.33 - mov ebx, DWORD PTR [inU] ;1939.33 - mov edx, DWORD PTR [inV] ;1939.33 + mov ebx, DWORD PTR [src_u] ;1939.33 + mov edx, DWORD PTR [src_v] ;1939.33 loop0: - movq xmm6, QWORD PTR [ebx] ;inU - movq xmm0, QWORD PTR [edx] ;inV - punpcklbw xmm6, xmm0 ;inU, inV mix + movq xmm6, QWORD PTR [ebx] ;src_u + movq xmm0, QWORD PTR [edx] ;src_v + punpcklbw xmm6, xmm0 ;src_u, src_v mix movdqa xmm1, xmm6 movdqa xmm2, xmm6 
movdqa xmm4, xmm6 movdqu xmm3, XMMWORD PTR [eax] ;in1 - punpcklbw xmm1, xmm3 ;inU, in1, inV + punpcklbw xmm1, xmm3 ;src_u, in1, src_v mov esi, DWORD PTR [out1] movdqu XMMWORD PTR [esi], xmm1 ;write to out1 movdqu xmm5, XMMWORD PTR [ecx] ;in2 - punpcklbw xmm2, xmm5 ;inU, in2, inV + punpcklbw xmm2, xmm5 ;src_u, in2, src_v mov edi, DWORD PTR [out2] movdqu XMMWORD PTR [edi], xmm2 ;write to out2 - punpckhbw xmm4, xmm3 ;inU, in1, inV again + punpckhbw xmm4, xmm3 ;src_u, in1, src_v again movdqu XMMWORD PTR [esi+16], xmm4 ;write to out1 again add esi, 32 mov DWORD PTR [out1], esi - punpckhbw xmm6, xmm5 ;inU, in2, inV again + punpckhbw xmm6, xmm5 ;src_u, in2, src_v again movdqu XMMWORD PTR [edi+16], xmm6 ;write to out2 again add edi, 32 mov DWORD PTR [out2], edi @@ -546,8 +539,8 @@ loop0: mov DWORD PTR [in1], eax ;1939.33 mov DWORD PTR [in2], ecx ;1939.33 - mov DWORD PTR [inU], ebx ;1939.33 - mov DWORD PTR [inV], edx ;1939.33 + mov DWORD PTR [src_u], ebx ;1939.33 + mov DWORD PTR [src_v], edx ;1939.33 ;popa emms @@ -848,62 +841,4 @@ RAWToI420(const uint8* src_frame, int src_stride, src_width, src_height, RAWToI420Row_C); } -inline -uint8 Clip(int32 val) -{ - if (val < 0){ - return (uint8)0; - } else if (val > 255){ - return (uint8)255; - } - return (uint8)val; -} - -#ifdef SCALEOPT -//memcpy_16 assumes that width is an integer multiple of 16! -void -*memcpy_16(void * dest, const void * src, size_t n) -{ - _asm - { - mov eax, dword ptr [src] - mov ebx, dword ptr [dest] - mov ecx, dword ptr [n] - - loop0: - - movdqu xmm0, XMMWORD PTR [eax] - movdqu XMMWORD PTR [ebx], xmm0 - add eax, 16 - add ebx, 16 - sub ecx, 16 - jg loop0 - } -} - -// memcpy_8 assumes that width is an integer multiple of 8! 
-void -*memcpy_8(void * dest, const void * src, size_t n) -{ - _asm - { - mov eax, dword ptr [src] - mov ebx, dword ptr [dest] - mov ecx, dword ptr [n] - - loop0: - - movq mm0, QWORD PTR [eax] - movq QWORD PTR [ebx], mm0 - add eax, 8 - add ebx, 8 - sub ecx, 8 - jg loop0 - emms - } - -} - -#endif - } // namespace libyuv diff --git a/source/cpu_id.cc b/source/cpu_id.cc index 6d8655c12..e986015ea 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "cpu_id.h" -#include "basic_types.h" // for CPU_X86 +#include "libyuv/cpu_id.h" +#include "libyuv/basic_types.h" // for CPU_X86 #ifdef _MSC_VER #include diff --git a/source/format_conversion.cc b/source/format_conversion.cc index 0db57ae4c..a058d6a9e 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -10,8 +10,7 @@ #include -#include "common.h" -#include "cpu_id.h" +#include "libyuv/cpu_id.h" #include "video_common.h" namespace libyuv { @@ -19,6 +18,15 @@ namespace libyuv { // Most code in here is inspired by the material at // http://www.siliconimaging.com/RGB%20Bayer.htm +// Forces compiler to inline, even against its better judgement. Use wisely. 
+#if defined(__GNUC__) +#define FORCE_INLINE __attribute__((always_inline)) +#elif defined(WIN32) +#define FORCE_INLINE __forceinline +#else +#define FORCE_INLINE +#endif + enum { RED = 0, BLUE = 1, @@ -98,7 +106,7 @@ static FORCE_INLINE void InterpolateBayerRGBCorner(uint8* r, uint8* g, uint8* b, const uint8* src, - int src_pitch, + int src_stride, Position pos, uint8 colour) { @@ -108,20 +116,20 @@ static FORCE_INLINE void InterpolateBayerRGBCorner(uint8* r, int adjacent_column; switch (pos) { case TOP_LEFT: - adjacent_row = src_pitch; + adjacent_row = src_stride; adjacent_column = 1; break; case TOP_RIGHT: - adjacent_row = src_pitch; + adjacent_row = src_stride; adjacent_column = -1; break; case BOTTOM_LEFT: - adjacent_row = -src_pitch; + adjacent_row = -src_stride; adjacent_column = 1; break; case BOTTOM_RIGHT: default: - adjacent_row = -src_pitch; + adjacent_row = -src_stride; adjacent_column = -1; break; } @@ -161,7 +169,7 @@ static FORCE_INLINE void InterpolateBayerRGBEdge(uint8* r, uint8* g, uint8* b, const uint8* src, - int src_pitch, + int src_stride, Position pos, uint8 colour) { @@ -176,21 +184,21 @@ static FORCE_INLINE void InterpolateBayerRGBEdge(uint8* r, switch (pos) { case TOP_EDGE: - inner = src_pitch; + inner = src_stride; side = 1; break; case RIGHT_EDGE: inner = -1; - side = src_pitch; + side = src_stride; break; case BOTTOM_EDGE: - inner = -src_pitch; + inner = -src_stride; side = 1; break; case LEFT_EDGE: default: inner = 1; - side = src_pitch; + side = src_stride; break; } @@ -234,7 +242,7 @@ static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r, uint8* g, uint8* b, const uint8* src, - int src_pitch, + int src_stride, uint8 colour) { if (IsRedBlue(colour)) { @@ -245,12 +253,12 @@ static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r, // quality here by using only two of the green pixels based on the // correlation to the nearby red/blue pixels, but that is slower and would // result in more edge cases. 
- *g = (src[1] + src[-1] + src[src_pitch] + src[-src_pitch]) / 4; + *g = (src[1] + src[-1] + src[src_stride] + src[-src_stride]) / 4; // Average of the oppositely-coloured corner pixels (there's four). - uint8 corner_average = (src[src_pitch + 1] + - src[src_pitch - 1] + - src[-src_pitch + 1] + - src[-src_pitch - 1]) / 4; + uint8 corner_average = (src[src_stride + 1] + + src[src_stride - 1] + + src[-src_stride + 1] + + src[-src_stride - 1]) / 4; if (colour == RED) { *r = current_pixel; *b = corner_average; @@ -263,7 +271,7 @@ static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r, // Average of the adjacent same-row pixels (there's two). uint8 row_adjacent = (src[1] + src[-1]) / 2; // Average of the adjacent same-column pixels (there's two). - uint8 column_adjacent = (src[src_pitch] + src[-src_pitch]) / 2; + uint8 column_adjacent = (src[src_stride] + src[-src_stride]) / 2; if (colour == GREEN_BETWEEN_RED) { *r = row_adjacent; *b = column_adjacent; @@ -275,15 +283,15 @@ static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r, } // Converts any Bayer RGB format to ARGB. -void BayerRGBToARGB(const uint8* src, int src_pitch, uint32 src_fourcc, - uint8* dst, int dst_pitch, - int width, int height) { +int BayerRGBToARGB(const uint8* src, int src_stride, uint32 src_fourcc, + uint8* dst, int dst_stride, + int width, int height) { assert(width % 2 == 0); assert(height % 2 == 0); uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc); - int src_row_inc = src_pitch * 2 - width; - int dst_row_inc = dst_pitch * 2 - width * 4; + int src_row_inc = src_stride * 2 - width; + int dst_row_inc = dst_stride * 2 - width * 4; // Iterate over the 2x2 grids. 
for (int y1 = 0; y1 < height; y1 += 2) { @@ -297,24 +305,24 @@ void BayerRGBToARGB(const uint8* src, int src_pitch, uint32 src_fourcc, uint8 current_colour = static_cast(colours); colours >>= 8; Position pos = GetPosition(x1 + x2, y1 + y2, width, height); - const uint8* src_pixel = &src[y2 * src_pitch + x2]; - uint8* dst_pixel = &dst[y2 * dst_pitch + x2 * 4]; + const uint8* src_pixel = &src[y2 * src_stride + x2]; + uint8* dst_pixel = &dst[y2 * dst_stride + x2 * 4]; // Convert from Bayer RGB to regular RGB. if (pos == MIDDLE) { // 99% of the image is the middle. InterpolateBayerRGBCenter(&r, &g, &b, - src_pixel, src_pitch, + src_pixel, src_stride, current_colour); } else if (pos >= LEFT_EDGE) { // Next most frequent is edges. InterpolateBayerRGBEdge(&r, &g, &b, - src_pixel, src_pitch, pos, + src_pixel, src_stride, pos, current_colour); } else { // Last is the corners. There are only 4. InterpolateBayerRGBCorner(&r, &g, &b, - src_pixel, src_pitch, pos, + src_pixel, src_stride, pos, current_colour); } @@ -331,23 +339,24 @@ void BayerRGBToARGB(const uint8* src, int src_pitch, uint32 src_fourcc, src += src_row_inc; dst += dst_row_inc; } + return 0; } // Converts any Bayer RGB format to I420. 
-void BayerRGBToI420(const uint8* src, int src_pitch, uint32 src_fourcc, - uint8* y, int y_pitch, - uint8* u, int u_pitch, - uint8* v, int v_pitch, - int width, int height) { +int BayerRGBToI420(const uint8* src, int src_stride, uint32 src_fourcc, + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int width, int height) { assert(width % 2 == 0); assert(height % 2 == 0); uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc); - int src_row_inc = src_pitch * 2 - width; - int y_row_inc = y_pitch * 2 - width; - int u_row_inc = u_pitch - width / 2; - int v_row_inc = v_pitch - width / 2; + int src_row_inc = src_stride * 2 - width; + int y_row_inc = y_stride * 2 - width; + int u_row_inc = u_stride - width / 2; + int v_row_inc = v_stride - width / 2; // Iterate over the 2x2 grids. for (int y1 = 0; y1 < height; y1 += 2) { @@ -363,25 +372,25 @@ void BayerRGBToI420(const uint8* src, int src_pitch, uint32 src_fourcc, uint8 current_colour = static_cast(colours); colours >>= 8; Position pos = GetPosition(x1 + x2, y1 + y2, width, height); - const uint8* src_pixel = &src[y2 * src_pitch + x2]; - uint8* y_pixel = &y[y2 * y_pitch + x2]; + const uint8* src_pixel = &src[y2 * src_stride + x2]; + uint8* y_pixel = &y[y2 * y_stride + x2]; // Convert from Bayer RGB to regular RGB. if (pos == MIDDLE) { // 99% of the image is the middle. InterpolateBayerRGBCenter(&r, &g, &b, - src_pixel, src_pitch, + src_pixel, src_stride, current_colour); } else if (pos >= LEFT_EDGE) { // Next most frequent is edges. InterpolateBayerRGBEdge(&r, &g, &b, - src_pixel, src_pitch, pos, + src_pixel, src_stride, pos, current_colour); } else { // Last is the corners. There are only 4. 
InterpolateBayerRGBCorner(&r, &g, &b, - src_pixel, src_pitch, pos, + src_pixel, src_stride, pos, current_colour); } @@ -405,6 +414,7 @@ void BayerRGBToI420(const uint8* src, int src_pitch, uint32 src_fourcc, u += u_row_inc; v += v_row_inc; } + return 0; } // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers @@ -490,18 +500,18 @@ static uint32 GenerateSelector(int select0, int select1) { } // Converts 32 bit ARGB to any Bayer RGB format. -void ARGBToBayerRGB(const uint8* src_rgb, int src_pitch_rgb, - uint8* dst_bayer, int dst_pitch_bayer, - uint32 dst_fourcc_bayer, - int width, int height) { +int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb, + uint8* dst_bayer, int dst_stride_bayer, + uint32 dst_fourcc_bayer, + int width, int height) { assert(width % 2 == 0); void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix); #if defined(HAS_ARGBTOBAYERROW_SSSE3) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && (width % 4 == 0) && - IS_ALIGNED(src_rgb, 16) && (src_pitch_rgb % 16 == 0) && - IS_ALIGNED(dst_bayer, 4) && (dst_pitch_bayer % 4 == 0)) { + IS_ALIGNED(src_rgb, 16) && (src_stride_rgb % 16 == 0) && + IS_ALIGNED(dst_bayer, 4) && (dst_stride_bayer % 4 == 0)) { ARGBToBayerRow = ARGBToBayerRow_SSSE3; } else #endif @@ -540,9 +550,10 @@ void ARGBToBayerRGB(const uint8* src_rgb, int src_pitch_rgb, // Now convert. for (int y = 0; y < height; ++y) { ARGBToBayerRow(src_rgb, dst_bayer, index_map[y & 1], width); - src_rgb += src_pitch_rgb; - dst_bayer += dst_pitch_bayer; + src_rgb += src_stride_rgb; + dst_bayer += dst_stride_bayer; } + return 0; } } // namespace libyuv diff --git a/source/general.cc b/source/general.cc index 0759db854..27f97bdc4 100644 --- a/source/general.cc +++ b/source/general.cc @@ -8,14 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "general.h" +#include "libyuv/general.h" #include // memcpy(), memset() -#include "planar_functions.h" +#include "libyuv/planar_functions.h" #include "rotate.h" - namespace libyuv { int @@ -25,11 +24,11 @@ I420Mirror(const uint8* src_yplane, int src_ystride, uint8* dst_yplane, int dst_ystride, uint8* dst_uplane, int dst_ustride, uint8* dst_vplane, int dst_vstride, - int width, int height) -{ + int width, int height) { if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL || - dst_yplane == NULL || dst_uplane == NULL || dst_vplane == NULL) + dst_yplane == NULL || dst_uplane == NULL || dst_vplane == NULL) { return -1; + } int indO = 0; int indS = 0; @@ -39,8 +38,8 @@ I420Mirror(const uint8* src_yplane, int src_ystride, const int halfWidth = (width + 1) >> 1; // Y - for (wind = 0; wind < halfWidth; wind++){ - for (hind = 0; hind < height; hind++){ + for (wind = 0; wind < halfWidth; wind++) { + for (hind = 0; hind < height; hind++) { indO = hind * src_ystride + wind; indS = hind * dst_ystride + (width - wind - 1); tmpVal = src_yplane[indO]; @@ -53,8 +52,8 @@ I420Mirror(const uint8* src_yplane, int src_ystride, const int halfSrcuvStride = (height + 1) >> 1; const int halfuvWidth = (width + 1) >> 2; - for (wind = 0; wind < halfuvWidth; wind++){ - for (hind = 0; hind < halfHeight; hind++){ + for (wind = 0; wind < halfuvWidth; wind++) { + for (hind = 0; hind < halfHeight; hind++) { indO = hind * halfSrcuvStride + wind; indS = hind * halfSrcuvStride + (halfuvWidth - wind - 1); // U @@ -79,11 +78,11 @@ I420Crop(uint8* frame, if (frame == NULL) return -1; - if (src_width == dst_width && src_height == dst_height){ + if (src_width == dst_width && src_height == dst_height) { // Nothing to do return 3 * dst_height * dst_width / 2; } - if (dst_width > src_width || dst_height > src_height){ + if (dst_width > src_width || dst_height > src_height) { // error return -1; } @@ -98,21 +97,21 @@ I420Crop(uint8* frame, int crop_width = ( src_width - dst_width ) / 
2; for (i = src_width * crop_height + crop_width; loop < dst_height ; - loop++, i += src_width){ + loop++, i += src_width) { memcpy(&frame[m],&frame[i],dst_width); m += dst_width; } i = src_width * src_height; // ilum loop = 0; for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2); - loop < halfdst_height; loop++,i += halfsrc_width){ + loop < halfdst_height; loop++,i += halfsrc_width) { memcpy(&frame[m],&frame[i],half_dst_width); m += half_dst_width; } loop = 0; i = src_width * src_height + half_dst_height * halfsrc_width; // ilum + Cr for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2); - loop < halfdst_height; loop++, i += halfsrc_width){ + loop < halfdst_height; loop++, i += halfsrc_width) { memcpy(&frame[m],&frame[i],half_dst_width); m += half_dst_width; } @@ -122,66 +121,59 @@ I420Crop(uint8* frame, int I420CropPad(const uint8* src_frame, int src_width, - int src_height, uint8* dst_frame, - int dst_width, int dst_height) + int src_height, uint8* dst_frame, + int dst_width, int dst_height) { - if (src_width < 1 || dst_width < 1 || src_height < 1 || dst_height < 1 ) + if (src_width < 1 || dst_width < 1 || src_height < 1 || dst_height < 1) { return -1; - if (src_width == dst_width && src_height == dst_height) + } + if (src_width == dst_width && src_height == dst_height) { memcpy(dst_frame, src_frame, 3 * dst_width * (dst_height >> 1)); - else - { - if ( src_height < dst_height){ + } else { + if (src_height < dst_height) { // pad height int pad_height = dst_height - src_height; int i = 0; int pad_width = 0; int crop_width = 0; int width = src_width; - if (src_width < dst_width){ + if (src_width < dst_width) { // pad width pad_width = dst_width - src_width; - } else{ - // cut width - crop_width = src_width - dst_width; - width = dst_width; + } else { + // cut width + crop_width = src_width - dst_width; + width = dst_width; } - if (pad_height){ + if (pad_height) { memset(dst_frame, 0, dst_width * (pad_height >> 1)); dst_frame += dst_width * 
(pad_height >> 1); } - for (i = 0; i < src_height;i++) - { - if (pad_width) - { - memset(dst_frame, 0, pad_width / 2); - dst_frame += pad_width / 2; - } - src_frame += crop_width >> 1; // in case we have a cut - memcpy(dst_frame,src_frame ,width); - src_frame += crop_width >> 1; - dst_frame += width; - src_frame += width; - if (pad_width) - { + for (i = 0; i < src_height;i++) { + if (pad_width) { memset(dst_frame, 0, pad_width / 2); dst_frame += pad_width / 2; - } + } + src_frame += crop_width >> 1; // in case we have a cut + memcpy(dst_frame,src_frame ,width); + src_frame += crop_width >> 1; + dst_frame += width; + src_frame += width; + if (pad_width) { + memset(dst_frame, 0, pad_width / 2); + dst_frame += pad_width / 2; + } } - if (pad_height) - { - memset(dst_frame, 0, dst_width * (pad_height >> 1)); - dst_frame += dst_width * (pad_height >> 1); + if (pad_height) { + memset(dst_frame, 0, dst_width * (pad_height >> 1)); + dst_frame += dst_width * (pad_height >> 1); } - if (pad_height) - { + if (pad_height) { memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1)); dst_frame += (dst_width >> 2) * (pad_height >> 1); } - for (i = 0; i < (src_height >> 1); i++) - { - if (pad_width) - { + for (i = 0; i < (src_height >> 1); i++) { + if (pad_width) { memset(dst_frame, 127, pad_width >> 2); dst_frame += pad_width >> 2; } @@ -190,21 +182,17 @@ I420CropPad(const uint8* src_frame, int src_width, src_frame += crop_width >> 2; dst_frame += width >> 1; src_frame += width >> 1; - if (pad_width) - { + if (pad_width) { memset(dst_frame, 127, pad_width >> 2); dst_frame += pad_width >> 2; } } - if (pad_height) - { + if (pad_height) { memset(dst_frame, 127, (dst_width >> 1) * (pad_height >> 1)); dst_frame += (dst_width >> 1) * (pad_height >> 1); } - for (i = 0; i < (src_height >> 1); i++) - { - if (pad_width) - { + for (i = 0; i < (src_height >> 1); i++) { + if (pad_width) { memset(dst_frame, 127, pad_width >> 2); dst_frame += pad_width >> 2; } @@ -213,32 +201,26 @@ 
I420CropPad(const uint8* src_frame, int src_width, src_frame += crop_width >> 2; dst_frame += width >> 1; src_frame += width >> 1; - if (pad_width) - { + if (pad_width) { memset(dst_frame, 127, pad_width >> 2); dst_frame += pad_width >> 2; } } - if (pad_height) - { + if (pad_height) { memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1)); dst_frame += (dst_width >> 2) * (pad_height >> 1); } - } - else - { + } else { // cut height int i = 0; int pad_width = 0; int crop_width = 0; int width = src_width; - if (src_width < dst_width) - { + if (src_width < dst_width) { // pad width pad_width = dst_width - src_width; - } else - { + } else { // cut width crop_width = src_width - dst_width; width = dst_width; @@ -246,10 +228,8 @@ I420CropPad(const uint8* src_frame, int src_width, int diff_height = src_height - dst_height; src_frame += src_width * (diff_height >> 1); // skip top I - for (i = 0; i < dst_height; i++) - { - if (pad_width) - { + for (i = 0; i < dst_height; i++) { + if (pad_width) { memset(dst_frame, 0, pad_width / 2); dst_frame += pad_width / 2; } @@ -258,18 +238,15 @@ I420CropPad(const uint8* src_frame, int src_width, src_frame += crop_width >> 1; dst_frame += width; src_frame += width; - if (pad_width) - { + if (pad_width) { memset(dst_frame, 0, pad_width / 2); dst_frame += pad_width / 2; } } src_frame += src_width * (diff_height >> 1); // skip end I src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cr - for (i = 0; i < (dst_height >> 1); i++) - { - if (pad_width) - { + for (i = 0; i < (dst_height >> 1); i++) { + if (pad_width) { memset(dst_frame, 127, pad_width >> 2); dst_frame += pad_width >> 2; } @@ -278,18 +255,15 @@ I420CropPad(const uint8* src_frame, int src_width, src_frame += crop_width >> 2; dst_frame += width >> 1; src_frame += width >> 1; - if (pad_width) - { + if (pad_width) { memset(dst_frame, 127, pad_width >> 2); dst_frame += pad_width >> 2; } } src_frame += (src_width >> 2) * (diff_height >> 1); // skip end of Cr 
src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cb - for (i = 0; i < (dst_height >> 1); i++) - { - if (pad_width) - { + for (i = 0; i < (dst_height >> 1); i++) { + if (pad_width) { memset(dst_frame, 127, pad_width >> 2); dst_frame += pad_width >> 2; } @@ -298,8 +272,7 @@ I420CropPad(const uint8* src_frame, int src_width, src_frame += crop_width >> 2; dst_frame += width >> 1; src_frame += width >> 1; - if (pad_width) - { + if (pad_width) { memset(dst_frame, 127, pad_width >> 2); dst_frame += pad_width >> 2; } @@ -317,20 +290,17 @@ I420Rotate(const uint8* src_yplane, int src_ystride, uint8* dst_uplane, int dst_ustride, uint8* dst_vplane, int dst_vstride, int width, int height, - RotationMode mode) -{ - switch (mode){ - // TODO: should return int + RotationMode mode) { + switch (mode) { case kRotateNone: // copy frame - I420Copy(src_yplane, src_ystride, - src_uplane, src_ustride, - src_vplane, src_vstride, - dst_yplane, dst_ystride, - dst_uplane, dst_ustride, - dst_vplane, dst_vstride, - width, height); - return 0; + return I420Copy(src_yplane, src_ystride, + src_uplane, src_ustride, + src_vplane, src_vstride, + dst_yplane, dst_ystride, + dst_uplane, dst_ustride, + dst_vplane, dst_vstride, + width, height); break; case kRotateClockwise: Rotate90(src_yplane, src_ystride, @@ -374,4 +344,4 @@ I420Rotate(const uint8* src_yplane, int src_ystride, } } -} // nmaespace libyuv +} // namespace libyuv diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 56683f7ce..0b590c8f1 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -9,11 +9,11 @@ */ -#include "planar_functions.h" +#include "libyuv/planar_functions.h" #include -#include "cpu_id.h" +#include "libyuv/cpu_id.h" #include "row.h" namespace libyuv { @@ -38,19 +38,29 @@ static void SplitUV_NEON(const uint8* src_uv, ); } -#elif (defined(WIN32) || defined(__i386__)) && !defined(COVERAGE_ENABLED) && \ - !defined(__PIC__) && !TARGET_IPHONE_SIMULATOR +#elif 
(defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ + && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) #if defined(_MSC_VER) #define TALIGN16(t, var) static __declspec(align(16)) t _ ## var -#elif defined(OSX) -#define TALIGN16(t, var) t var __attribute__((aligned(16))) #else -#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) #endif -// shuffle constant to put even bytes in low 8 and odd bytes in high 8 bytes -extern "C" TALIGN16(const uint8, shufevenodd[16]) = - { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; +// Shuffle table for converting ABGR to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = + { 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u }; + +// Shuffle table for converting BGRA to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = + { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u }; + +// Shuffle table for converting BG24 to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = + { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u }; + +// Shuffle table for converting RAW to ARGB. 
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = + { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; #if defined(WIN32) && !defined(COVERAGE_ENABLED) #define HAS_SPLITUV_SSE2 @@ -89,118 +99,40 @@ static void SplitUV_SSE2(const uint8* src_uv, } } -#define HAS_SPLITUV_SSSE3 -__declspec(naked) -static void SplitUV_SSSE3(const uint8* src_uv, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - movdqa xmm7, _shufevenodd - - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pshufb xmm0, xmm7 // 8 u's and 8 v's - pshufb xmm1, xmm7 // 8 u's and 8 v's - movdqa xmm2, xmm0 - punpcklqdq xmm0, xmm1 // 16 u's - punpckhqdq xmm2, xmm1 // 16 v's - movdqa [edx], xmm0 - lea edx, [edx + 16] - movdqa [edi], xmm2 - lea edi, [edi + 16] - sub ecx, 16 - ja wloop - pop edi - ret - } -} -#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ - !TARGET_IPHONE_SIMULATOR +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) #define HAS_SPLITUV_SSE2 -extern "C" void SplitUV_SSE2(const uint8* src_uv, - uint8* dst_u, uint8* dst_v, int pix); +static void SplitUV_SSE2(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix) { asm( - ".text\n" -#if defined(OSX) - ".globl _SplitUV_SSE2\n" -"_SplitUV_SSE2:\n" -#else - ".global SplitUV_SSE2\n" -"SplitUV_SSE2:\n" -#endif - "push %edi\n" - "mov 0x8(%esp),%eax\n" - "mov 0xc(%esp),%edx\n" - "mov 0x10(%esp),%edi\n" - "mov 0x14(%esp),%ecx\n" - "pcmpeqb %xmm7,%xmm7\n" - "psrlw $0x8,%xmm7\n" - + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" "1:" - "movdqa (%eax),%xmm0\n" - "movdqa 0x10(%eax),%xmm1\n" - "lea 0x20(%eax),%eax\n" - "movdqa %xmm0,%xmm2\n" - "movdqa %xmm1,%xmm3\n" - "pand %xmm7,%xmm0\n" - "pand %xmm7,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,(%edx)\n" - 
"lea 0x10(%edx),%edx\n" - "psrlw $0x8,%xmm2\n" - "psrlw $0x8,%xmm3\n" - "packuswb %xmm3,%xmm2\n" - "movdqa %xmm2,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "pop %edi\n" - "ret\n" -); - -#define HAS_SPLITUV_SSSE3 -extern "C" void SplitUV_SSSE3(const uint8* src_uv, - uint8* dst_u, uint8* dst_v, int pix); - asm( - ".text\n" -#if defined(OSX) - ".globl _SplitUV_SSSE3\n" -"_SplitUV_SSSE3:\n" -#else - ".global SplitUV_SSSE3\n" -"SplitUV_SSSE3:\n" -#endif - "push %edi\n" - "mov 0x8(%esp),%eax\n" - "mov 0xc(%esp),%edx\n" - "mov 0x10(%esp),%edi\n" - "mov 0x14(%esp),%ecx\n" - "movdqa _shufevenodd,%xmm7\n" - -"1:" - "movdqa (%eax),%xmm0\n" - "movdqa 0x10(%eax),%xmm1\n" - "lea 0x20(%eax),%eax\n" - "pshufb %xmm7,%xmm0\n" - "pshufb %xmm7,%xmm1\n" - "movdqa %xmm0,%xmm2\n" - "punpcklqdq %xmm1,%xmm0\n" - "punpckhqdq %xmm1,%xmm2\n" - "movdqa %xmm0,(%edx)\n" - "lea 0x10(%edx),%edx\n" - "movdqa %xmm2,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "pop %edi\n" - "ret\n" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "movdqa %%xmm0,%%xmm2\n" + "movdqa %%xmm1,%%xmm3\n" + "pand %%xmm7,%%xmm0\n" + "pand %%xmm7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "psrlw $0x8,%%xmm2\n" + "psrlw $0x8,%%xmm3\n" + "packuswb %%xmm3,%%xmm2\n" + "movdqa %%xmm2,(%2)\n" + "lea 0x10(%2),%2\n" + "sub $0x10,%3\n" + "ja 1b\n" + : + : "r"(src_uv), // %0 + "r"(dst_u), // %1 + "r"(dst_v), // %2 + "r"(pix) // %3 + : "memory" ); +} #endif #endif @@ -216,28 +148,28 @@ static void SplitUV_C(const uint8* src_uv, } } -static void I420CopyPlane(const uint8* src_y, int src_pitch_y, - uint8* dst_y, int dst_pitch_y, +static void I420CopyPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, int width, int height) { // Copy plane for (int y = 0; y < height; ++y) { memcpy(dst_y, src_y, width); - src_y += src_pitch_y; - dst_y += dst_pitch_y; + src_y += src_stride_y; + dst_y += 
dst_stride_y; } } -static void I420CopyPlane2(const uint8* src, int src_pitch_0, int src_pitch_1, - uint8* dst, int dst_pitch, +static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, + uint8* dst, int dst_stride, int width, int height) { // Copy plane for (int y = 0; y < height; y += 2) { memcpy(dst, src, width); - src += src_pitch_0; - dst += dst_pitch; + src += src_stride_0; + dst += dst_stride; memcpy(dst, src, width); - src += src_pitch_1; - dst += dst_pitch; + src += src_stride_1; + dst += dst_stride; } } @@ -249,81 +181,83 @@ static void I420CopyPlane2(const uint8* src, int src_pitch_0, int src_pitch_1, // Helper function to copy yuv data without scaling. Used // by our jpeg conversion callbacks to incrementally fill a yuv image. -void I420Copy(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height) { +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { // Negative height means invert the image. 
if (height < 0) { height = -height; - src_y = src_y + (height - 1) * src_pitch_y; - src_u = src_u + (height - 1) * src_pitch_u; - src_v = src_v + (height - 1) * src_pitch_v; - src_pitch_y = -src_pitch_y; - src_pitch_u = -src_pitch_u; - src_pitch_v = -src_pitch_v; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; } int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - I420CopyPlane(src_y, src_pitch_y, dst_y, dst_pitch_y, width, height); - I420CopyPlane(src_u, src_pitch_u, dst_u, dst_pitch_u, halfwidth, halfheight); - I420CopyPlane(src_v, src_pitch_v, dst_v, dst_pitch_v, halfwidth, halfheight); + I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; } // Helper function to copy yuv data without scaling. Used // by our jpeg conversion callbacks to incrementally fill a yuv image. -void I422ToI420(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height) { +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { // Negative height means invert the image. 
if (height < 0) { height = -height; - src_y = src_y + (height - 1) * src_pitch_y; - src_u = src_u + (height - 1) * src_pitch_u; - src_v = src_v + (height - 1) * src_pitch_v; - src_pitch_y = -src_pitch_y; - src_pitch_u = -src_pitch_u; - src_pitch_v = -src_pitch_v; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; } // Copy Y plane - I420CopyPlane(src_y, src_pitch_y, dst_y, dst_pitch_y, width, height); + I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); // SubSample UV planes. int x, y; int halfwidth = (width + 1) >> 1; for (y = 0; y < height; y += 2) { const uint8* u0 = src_u; - const uint8* u1 = src_u + src_pitch_u; + const uint8* u1 = src_u + src_stride_u; if ((y + 1) >= height) { u1 = u0; } for (x = 0; x < halfwidth; ++x) { dst_u[x] = (u0[x] + u1[x] + 1) >> 1; } - src_u += src_pitch_u * 2; - dst_u += dst_pitch_u; + src_u += src_stride_u * 2; + dst_u += dst_stride_u; } for (y = 0; y < height; y += 2) { const uint8* v0 = src_v; - const uint8* v1 = src_v + src_pitch_v; + const uint8* v1 = src_v + src_stride_v; if ((y + 1) >= height) { v1 = v0; } for (x = 0; x < halfwidth; ++x) { dst_v[x] = (v0[x] + v1[x] + 1) >> 1; } - src_v += src_pitch_v * 2; - dst_v += dst_pitch_v; + src_v += src_stride_v * 2; + dst_v += dst_stride_v; } + return 0; } // Support converting from FOURCC_M420 @@ -332,26 +266,26 @@ void I422ToI420(const uint8* src_y, int src_pitch_y, // M420 format description: // M420 is row biplanar 420: 2 rows of Y and 1 row of VU. // Chroma is half width / half height. (420) -// src_pitch_m420 is row planar. Normally this will be the width in pixels. -// The UV plane is half width, but 2 values, so src_pitch_m420 applies to this +// src_stride_m420 is row planar. Normally this will be the width in pixels. 
+// The UV plane is half width, but 2 values, so src_stride_m420 applies to this // as well as the two Y planes. // TODO(fbarchard): Do NV21/NV12 formats with this function -static void X420ToI420(const uint8* src_y, - int src_pitch_y0, int src_pitch_y1, - const uint8* src_uv, int src_pitch_uv, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height) { +static int X420ToI420(const uint8* src_y, + int src_stride_y0, int src_stride_y1, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { // Negative height means invert the image. if (height < 0) { height = -height; - dst_y = dst_y + (height - 1) * dst_pitch_y; - dst_u = dst_u + (height - 1) * dst_pitch_u; - dst_v = dst_v + (height - 1) * dst_pitch_v; - dst_pitch_y = -dst_pitch_y; - dst_pitch_u = -dst_pitch_u; - dst_pitch_v = -dst_pitch_v; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; } int halfwidth = (width + 1) >> 1; @@ -359,25 +293,17 @@ static void X420ToI420(const uint8* src_y, #if defined(HAS_SPLITUV_NEON) if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && (halfwidth % 16 == 0) && - IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) && - IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) && - IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) { + IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && + IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && + IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) { SplitUV = SplitUV_NEON; } else -#elif defined(HAS_SPLITUV_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (halfwidth % 16 == 0) && - IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) && - IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 
== 0) && - IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) { - SplitUV = SplitUV_SSSE3; - } else #elif defined(HAS_SPLITUV_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && (halfwidth % 16 == 0) && - IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) && - IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) && - IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) { + IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && + IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && + IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) { SplitUV = SplitUV_SSE2; } else #endif @@ -385,43 +311,48 @@ static void X420ToI420(const uint8* src_y, SplitUV = SplitUV_C; } - I420CopyPlane2(src_y, src_pitch_y0, src_pitch_y1, dst_y, dst_pitch_y, + I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, width, height); int halfheight = (height + 1) >> 1; for (int y = 0; y < halfheight; ++y) { // Copy a row of UV. SplitUV(src_uv, dst_u, dst_v, halfwidth); - dst_u += dst_pitch_u; - dst_v += dst_pitch_v; - src_uv += src_pitch_uv; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; } + return 0; } // Convert M420 to I420. -void M420ToI420(const uint8* src_m420, int src_pitch_m420, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height) { - X420ToI420(src_m420, src_pitch_m420, src_pitch_m420 * 2, - src_m420 + src_pitch_m420 * 2, src_pitch_m420 * 3, - dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v, - width, height); +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); } // Convert NV12 to I420. 
-void NV12ToI420(const uint8* src_y, - const uint8* src_uv, - int src_pitch, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height) { - X420ToI420(src_y, src_pitch, src_pitch, - src_uv, src_pitch, - dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v, - width, height); +int NV12ToI420(const uint8* src_y, + const uint8* src_uv, + int src_stride, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride, src_stride, + src_uv, src_stride, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); } #if defined(WIN32) && !defined(COVERAGE_ENABLED) @@ -471,59 +402,48 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ret } } -#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ - !TARGET_IPHONE_SIMULATOR +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) #define HAS_SPLITYUY2_SSE2 -extern "C" void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, - uint8* dst_u, uint8* dst_v, int pix); +static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, + uint8* dst_u, uint8* dst_v, int pix) { asm( - ".text\n" -#if defined(OSX) - ".globl _SplitYUY2_SSE2\n" -"_SplitYUY2_SSE2:\n" -#else - ".global SplitYUY2_SSE2\n" -"SplitYUY2_SSE2:\n" -#endif - "push %esi\n" - "push %edi\n" - "mov 0xc(%esp),%eax\n" - "mov 0x10(%esp),%edx\n" - "mov 0x14(%esp),%esi\n" - "mov 0x18(%esp),%edi\n" - "mov 0x1c(%esp),%ecx\n" - "pcmpeqb %xmm7,%xmm7\n" - "psrlw $0x8,%xmm7\n" - + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" "1:" - "movdqa (%eax),%xmm0\n" - "movdqa 0x10(%eax),%xmm1\n" - "lea 0x20(%eax),%eax\n" - "movdqa %xmm0,%xmm2\n" - "movdqa %xmm1,%xmm3\n" - "pand %xmm7,%xmm2\n" - "pand %xmm7,%xmm3\n" - "packuswb %xmm3,%xmm2\n" - "movdqa %xmm2,(%edx)\n" - "lea 0x10(%edx),%edx\n" - "psrlw $0x8,%xmm0\n" - 
"psrlw $0x8,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,%xmm1\n" - "pand %xmm7,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,(%esi)\n" - "lea 0x8(%esi),%esi\n" - "psrlw $0x8,%xmm1\n" - "packuswb %xmm1,%xmm1\n" - "movq %xmm1,(%edi)\n" - "lea 0x8(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "movdqa %%xmm0,%%xmm2\n" + "movdqa %%xmm1,%%xmm3\n" + "pand %%xmm7,%%xmm2\n" + "pand %%xmm7,%%xmm3\n" + "packuswb %%xmm3,%%xmm2\n" + "movdqa %%xmm2,(%1)\n" + "lea 0x10(%1),%1\n" + "psrlw $0x8,%%xmm0\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,%%xmm1\n" + "pand %%xmm7,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%2)\n" + "lea 0x8(%2),%2\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,(%3)\n" + "lea 0x8(%3),%3\n" + "sub $0x10,%4\n" + "ja 1b\n" + : + : "r"(src_yuy2), // %0 + "r"(dst_y), // %1 + "r"(dst_u), // %2 + "r"(dst_v), // %3 + "r"(pix) // %4 + : "memory" ); +} #endif static void SplitYUY2_C(const uint8* src_yuy2, @@ -543,21 +463,21 @@ static void SplitYUY2_C(const uint8* src_yuy2, // Convert Q420 to I420. 
// Format is rows of YY/YUYV -void Q420ToI420(const uint8* src_y, int src_pitch_y, - const uint8* src_yuy2, int src_pitch_yuy2, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height) { +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { void (*SplitYUY2)(const uint8* src_yuy2, uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix); #if defined(HAS_SPLITYUY2_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && (width % 16 == 0) && - IS_ALIGNED(src_yuy2, 16) && (src_pitch_yuy2 % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) { + IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { SplitYUY2 = SplitYUY2_SSE2; } else #endif @@ -566,16 +486,17 @@ void Q420ToI420(const uint8* src_y, int src_pitch_y, } for (int y = 0; y < height; y += 2) { memcpy(dst_y, src_y, width); - dst_y += dst_pitch_y; - src_y += src_pitch_y; + dst_y += dst_stride_y; + src_y += src_stride_y; // Copy a row of YUY2. 
SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width); - dst_y += dst_pitch_y; - dst_u += dst_pitch_u; - dst_v += dst_pitch_v; - src_yuy2 += src_pitch_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_yuy2 += src_stride_yuy2; } + return 0; } #if defined(WIN32) && !defined(COVERAGE_ENABLED) @@ -606,13 +527,13 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, } __declspec(naked) -void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int pitch_yuy2, +void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_y, int pix) { __asm { push esi push edi mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // pitch_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix @@ -673,13 +594,13 @@ void UYVYToI420RowY_SSE2(const uint8* src_uyvy, } __declspec(naked) -void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy, +void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_y, int pix) { __asm { push esi push edi mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // pitch_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix @@ -714,174 +635,138 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy, ret } } -#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ - !TARGET_IPHONE_SIMULATOR + +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) #define HAS_YUY2TOI420ROW_SSE2 -extern "C" void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix); +static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { asm( - ".text\n" -#if defined(OSX) - ".globl _YUY2ToI420RowY_SSE2\n" -"_YUY2ToI420RowY_SSE2:\n" -#else - ".global YUY2ToI420RowY_SSE2\n" -"YUY2ToI420RowY_SSE2:\n" -#endif - 
"mov 0x4(%esp),%eax\n" - "mov 0x8(%esp),%edx\n" - "mov 0xc(%esp),%ecx\n" - "pcmpeqb %xmm7,%xmm7\n" - "psrlw $0x8,%xmm7\n" - + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" "1:" - "movdqa (%eax),%xmm0\n" - "movdqa 0x10(%eax),%xmm1\n" - "lea 0x20(%eax),%eax\n" - "pand %xmm7,%xmm0\n" - "pand %xmm7,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,(%edx)\n" - "lea 0x10(%edx),%edx\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "ret\n" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "pand %%xmm7,%%xmm0\n" + "pand %%xmm7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : + : "r"(src_yuy2), // %0 + "r"(dst_y), // %1 + "r"(pix) // %2 + : "memory" ); +} -extern "C" void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int pitch_yuy2, - uint8* dst_u, uint8* dst_y, int pix); +static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_y, int pix) { asm( - ".text\n" -#if defined(OSX) - ".globl _YUY2ToI420RowUV_SSE2\n" -"_YUY2ToI420RowUV_SSE2:\n" -#else - ".global YUY2ToI420RowUV_SSE2\n" -"YUY2ToI420RowUV_SSE2:\n" -#endif - "push %esi\n" - "push %edi\n" - "mov 0xc(%esp),%eax\n" - "mov 0x10(%esp),%esi\n" - "mov 0x14(%esp),%edx\n" - "mov 0x18(%esp),%edi\n" - "mov 0x1c(%esp),%ecx\n" - "pcmpeqb %xmm7,%xmm7\n" - "psrlw $0x8,%xmm7\n" - + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" "1:" - "movdqa (%eax),%xmm0\n" - "movdqa 0x10(%eax),%xmm1\n" - "movdqa (%eax,%esi,1),%xmm2\n" - "movdqa 0x10(%eax,%esi,1),%xmm3\n" - "lea 0x20(%eax),%eax\n" - "pavgb %xmm2,%xmm0\n" - "pavgb %xmm3,%xmm1\n" - "psrlw $0x8,%xmm0\n" - "psrlw $0x8,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,%xmm1\n" - "pand %xmm7,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,(%edx)\n" - "lea 0x8(%edx),%edx\n" - "psrlw $0x8,%xmm1\n" - "packuswb %xmm1,%xmm1\n" - "movq %xmm1,(%edi)\n" - "lea 0x8(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" + 
"movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa (%0,%1,1),%%xmm2\n" + "movdqa 0x10(%0,%1,1),%%xmm3\n" + "lea 0x20(%0),%0\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm3,%%xmm1\n" + "psrlw $0x8,%%xmm0\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,%%xmm1\n" + "pand %%xmm7,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%2)\n" + "lea 0x8(%2),%2\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,(%3)\n" + "lea 0x8(%3),%3\n" + "sub $0x10,%4\n" + "ja 1b\n" + : + : "r"(src_yuy2), // %0 + "r"((intptr_t)stride_yuy2), // %1 + "r"(dst_u), // %2 + "r"(dst_y), // %3 + "r"(pix) // %4 + : "memory" ); - +} #define HAS_UYVYTOI420ROW_SSE2 -extern "C" void UYVYToI420RowY_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix); +static void UYVYToI420RowY_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { asm( - ".text\n" -#if defined(OSX) - ".globl _UYVYToI420RowY_SSE2\n" -"_UYVYToI420RowY_SSE2:\n" -#else - ".global UYVYToI420RowY_SSE2\n" -"UYVYToI420RowY_SSE2:\n" -#endif - "mov 0x4(%esp),%eax\n" - "mov 0x8(%esp),%edx\n" - "mov 0xc(%esp),%ecx\n" - "1:" - "movdqa (%eax),%xmm0\n" - "movdqa 0x10(%eax),%xmm1\n" - "lea 0x20(%eax),%eax\n" - "psrlw $0x8,%xmm0\n" - "psrlw $0x8,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,(%edx)\n" - "lea 0x10(%edx),%edx\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "ret\n" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "psrlw $0x8,%%xmm0\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : + : "r"(src_uyvy), // %0 + "r"(dst_y), // %1 + "r"(pix) // %2 + : "memory" ); +} -extern "C" void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy, - uint8* dst_u, uint8* dst_y, int pix); +static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_y, int pix) { asm( - ".text\n" -#if defined(OSX) - ".globl _UYVYToI420RowUV_SSE2\n" 
-"_UYVYToI420RowUV_SSE2:\n" -#else - ".global UYVYToI420RowUV_SSE2\n" -"UYVYToI420RowUV_SSE2:\n" -#endif - "push %esi\n" - "push %edi\n" - "mov 0xc(%esp),%eax\n" - "mov 0x10(%esp),%esi\n" - "mov 0x14(%esp),%edx\n" - "mov 0x18(%esp),%edi\n" - "mov 0x1c(%esp),%ecx\n" - "pcmpeqb %xmm7,%xmm7\n" - "psrlw $0x8,%xmm7\n" - + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" "1:" - "movdqa (%eax),%xmm0\n" - "movdqa 0x10(%eax),%xmm1\n" - "movdqa (%eax,%esi,1),%xmm2\n" - "movdqa 0x10(%eax,%esi,1),%xmm3\n" - "lea 0x20(%eax),%eax\n" - "pavgb %xmm2,%xmm0\n" - "pavgb %xmm3,%xmm1\n" - "pand %xmm7,%xmm0\n" - "pand %xmm7,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,%xmm1\n" - "pand %xmm7,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,(%edx)\n" - "lea 0x8(%edx),%edx\n" - "psrlw $0x8,%xmm1\n" - "packuswb %xmm1,%xmm1\n" - "movq %xmm1,(%edi)\n" - "lea 0x8(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa (%0,%1,1),%%xmm2\n" + "movdqa 0x10(%0,%1,1),%%xmm3\n" + "lea 0x20(%0),%0\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm3,%%xmm1\n" + "pand %%xmm7,%%xmm0\n" + "pand %%xmm7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,%%xmm1\n" + "pand %%xmm7,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%2)\n" + "lea 0x8(%2),%2\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,(%3)\n" + "lea 0x8(%3),%3\n" + "sub $0x10,%4\n" + "ja 1b\n" + : + : "r"(src_uyvy), // %0 + "r"((intptr_t)stride_uyvy), // %1 + "r"(dst_u), // %2 + "r"(dst_y), // %3 + "r"(pix) // %4 + : "memory" ); +} #endif -void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_pitch_yuy2, +void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { // Copy a row of yuy2 UV values for (int x = 0; x < pix; x += 2) { - dst_u[0] = (src_yuy2[1] + src_yuy2[src_pitch_yuy2 + 1] + 1) >> 1; - dst_v[0] = (src_yuy2[3] + src_yuy2[src_pitch_yuy2 + 3] + 1) >> 1; + 
dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; src_yuy2 += 4; dst_u += 1; dst_v += 1; @@ -898,12 +783,12 @@ void YUY2ToI420RowY_C(const uint8* src_yuy2, } } -void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_pitch_uyvy, +void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { // Copy a row of uyvy UV values for (int x = 0; x < pix; x += 2) { - dst_u[0] = (src_uyvy[0] + src_uyvy[src_pitch_uyvy + 0] + 1) >> 1; - dst_v[0] = (src_uyvy[2] + src_uyvy[src_pitch_uyvy + 2] + 1) >> 1; + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; src_uyvy += 4; dst_u += 1; dst_v += 1; @@ -921,22 +806,22 @@ void UYVYToI420RowY_C(const uint8* src_uyvy, } // Convert YUY2 to I420. -void YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height) { - void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_pitch_yuy2, +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u, uint8* dst_v, int pix); void (*YUY2ToI420RowY)(const uint8* src_yuy2, uint8* dst_y, int pix); #if defined(HAS_YUY2TOI420ROW_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && (width % 16 == 0) && - IS_ALIGNED(src_yuy2, 16) && (src_pitch_yuy2 % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) { + IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 
== 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { YUY2ToI420RowY = YUY2ToI420RowY_SSE2; YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2; } else @@ -948,35 +833,36 @@ void YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2, for (int y = 0; y < height; ++y) { if ((y & 1) == 0) { if (y >= (height - 1) ) { // last chroma on odd height clamp height - src_pitch_yuy2 = 0; + src_stride_yuy2 = 0; } - YUY2ToI420RowUV(src_yuy2, src_pitch_yuy2, dst_u, dst_v, width); - dst_u += dst_pitch_u; - dst_v += dst_pitch_v; + YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; } YUY2ToI420RowY(src_yuy2, dst_y, width); - dst_y += dst_pitch_y; - src_yuy2 += src_pitch_yuy2; + dst_y += dst_stride_y; + src_yuy2 += src_stride_yuy2; } + return 0; } // Convert UYVY to I420. -void UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height) { - void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_pitch_uyvy, +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_u, uint8* dst_v, int pix); void (*UYVYToI420RowY)(const uint8* src_uyvy, uint8* dst_y, int pix); #if defined(HAS_UYVYTOI420ROW_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && (width % 16 == 0) && - IS_ALIGNED(src_uyvy, 16) && (src_pitch_uyvy % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) { + IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { 
UYVYToI420RowY = UYVYToI420RowY_SSE2; UYVYToI420RowUV = UYVYToI420RowUV_SSE2; } else @@ -988,119 +874,126 @@ void UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy, for (int y = 0; y < height; ++y) { if ((y & 1) == 0) { if (y >= (height - 1) ) { // last chroma on odd height clamp height - src_pitch_uyvy = 0; + src_stride_uyvy = 0; } - UYVYToI420RowUV(src_uyvy, src_pitch_uyvy, dst_u, dst_v, width); - dst_u += dst_pitch_u; - dst_v += dst_pitch_v; + UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; } UYVYToI420RowY(src_uyvy, dst_y, width); - dst_y += dst_pitch_y; - src_uyvy += src_pitch_uyvy; + dst_y += dst_stride_y; + src_uyvy += src_stride_uyvy; } + return 0; } // Convert I420 to ARGB. -// TODO(fbarchard): Add SSSE3 version and supply C version for fallback. -void I420ToARGB(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { +// TODO(fbarchard): Add SSE2 version and supply C version for fallback. +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { for (int y = 0; y < height; ++y) { FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_pitch_argb; - src_y += src_pitch_y; + dst_argb += dst_stride_argb; + src_y += src_stride_y; if (y & 1) { - src_u += src_pitch_u; - src_v += src_pitch_v; + src_u += src_stride_u; + src_v += src_stride_v; } } // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. EMMS(); + return 0; } // Convert I420 to BGRA. 
-void I420ToBGRA(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { for (int y = 0; y < height; ++y) { FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_pitch_argb; - src_y += src_pitch_y; + dst_argb += dst_stride_argb; + src_y += src_stride_y; if (y & 1) { - src_u += src_pitch_u; - src_v += src_pitch_v; + src_u += src_stride_u; + src_v += src_stride_v; } } EMMS(); + return 0; } // Convert I420 to BGRA. -void I420ToABGR(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { for (int y = 0; y < height; ++y) { FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_pitch_argb; - src_y += src_pitch_y; + dst_argb += dst_stride_argb; + src_y += src_stride_y; if (y & 1) { - src_u += src_pitch_u; - src_v += src_pitch_v; + src_u += src_stride_u; + src_v += src_stride_v; } } EMMS(); + return 0; } // Convert I422 to ARGB. 
-void I422ToARGB(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { for (int y = 0; y < height; ++y) { FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_pitch_argb; - src_y += src_pitch_y; - src_u += src_pitch_u; - src_v += src_pitch_v; + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; } // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. EMMS(); + return 0; } // Convert I444 to ARGB. -void I444ToARGB(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { for (int y = 0; y < height; ++y) { FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_pitch_argb; - src_y += src_pitch_y; - src_u += src_pitch_u; - src_v += src_pitch_v; + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; } // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. EMMS(); + return 0; } // Convert I400 to ARGB. 
-void I400ToARGB_Reference(const uint8* src_y, int src_pitch_y, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { for (int y = 0; y < height; ++y) { FastConvertYToRGB32Row(src_y, dst_argb, width); - dst_argb += dst_pitch_argb; - src_y += src_pitch_y; + dst_argb += dst_stride_argb; + src_y += src_stride_y; } // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. EMMS(); + return 0; } // TODO(fbarchard): 64 bit version @@ -1134,64 +1027,312 @@ static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { } } -#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ - !TARGET_IPHONE_SIMULATOR +#define HAS_ABGRTOARGBROW_SSSE3 +__declspec(naked) +static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_abgr + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + movdqa xmm7, byte ptr [_kShuffleMaskABGRToARGB] + convertloop : + movdqa xmm0, qword ptr [eax] + lea eax, [eax + 16] + pshufb xmm0, xmm7 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + ja convertloop + ret + } +} + +#define HAS_BGRATOARGBROW_SSSE3 +__declspec(naked) +static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_bgra + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + movdqa xmm7, byte ptr [_kShuffleMaskBGRAToARGB] + + convertloop : + movdqa xmm0, qword ptr [eax] + lea eax, [eax + 16] + pshufb xmm0, xmm7 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + ja convertloop + ret + } +} + +#define HAS_BG24TOARGBROW_SSSE3 +__declspec(naked) +static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_bg24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 
0xff000000 + pslld xmm7, 24 + movdqa xmm6, byte ptr [_kShuffleMaskBG24ToARGB] + + convertloop : + movdqa xmm0, qword ptr [eax] + movdqa xmm1, qword ptr [eax + 16] + movdqa xmm3, qword ptr [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm6 + por xmm2, xmm7 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm6 + movdqa [edx + 32], xmm2 + por xmm0, xmm7 + pshufb xmm1, xmm6 + movdqa [edx], xmm0 + por xmm1, xmm7 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm6 + movdqa [edx + 16], xmm1 + por xmm3, xmm7 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + ja convertloop + ret + } +} + +#define HAS_RAWTOARGBROW_SSSE3 +__declspec(naked) +static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 0xff000000 + pslld xmm7, 24 + movdqa xmm6, byte ptr [_kShuffleMaskRAWToARGB] + + convertloop : + movdqa xmm0, qword ptr [eax] + movdqa xmm1, qword ptr [eax + 16] + movdqa xmm3, qword ptr [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm6 + por xmm2, xmm7 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm6 + movdqa [edx + 32], xmm2 + por xmm0, xmm7 + pshufb xmm1, xmm6 + movdqa [edx], xmm0 + por xmm1, xmm7 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm6 + movdqa [edx + 16], xmm1 + por xmm3, xmm7 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + ja convertloop + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) + +// TODO(yuche): consider moving ARGB related codes to a separate file. 
#define HAS_I400TOARGBROW_SSE2 -extern "C" void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, - int pix); +static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm( - ".text\n" -#if defined(OSX) - ".globl _I400ToARGBRow_SSE2\n" -"_I400ToARGBRow_SSE2:\n" -#else - ".global I400ToARGBRow_SSE2\n" -"I400ToARGBRow_SSE2:\n" -#endif - "mov 0x4(%esp),%eax\n" - "mov 0x8(%esp),%edx\n" - "mov 0xc(%esp),%ecx\n" - "pcmpeqb %xmm7,%xmm7\n" - "pslld $0x18,%xmm7\n" + "pcmpeqb %%xmm7,%%xmm7\n" + "pslld $0x18,%%xmm7\n" "1:" - "movq (%eax),%xmm0\n" - "lea 0x8(%eax),%eax\n" - "punpcklbw %xmm0,%xmm0\n" - "movdqa %xmm0,%xmm1\n" - "punpcklwd %xmm0,%xmm0\n" - "punpckhwd %xmm1,%xmm1\n" - "por %xmm7,%xmm0\n" - "por %xmm7,%xmm1\n" - "movdqa %xmm0,(%edx)\n" - "movdqa %xmm1,0x10(%edx)\n" - "lea 0x20(%edx),%edx\n" - "sub $0x8,%ecx\n" - "ja 1b\n" - "ret\n" + "movq (%0),%%xmm0\n" + "lea 0x8(%0),%0\n" + "punpcklbw %%xmm0,%%xmm0\n" + "movdqa %%xmm0,%%xmm1\n" + "punpcklwd %%xmm0,%%xmm0\n" + "punpckhwd %%xmm1,%%xmm1\n" + "por %%xmm7,%%xmm0\n" + "por %%xmm7,%%xmm1\n" + "movdqa %%xmm0,(%1)\n" + "movdqa %%xmm1,0x10(%1)\n" + "lea 0x20(%1),%1\n" + "sub $0x8,%2\n" + "ja 1b\n" + : + : "r"(src_y), // %0 + "r"(dst_argb), // %1 + "r"(pix) // %2 + : "memory" ); +} + +#define HAS_ABGRTOARGBROW_SSSE3 +static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, + int pix) { + asm( + "movdqa (%3),%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "lea 0x10(%0),%0\n" + "pshufb %%xmm7,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x4,%2\n" + "ja 1b\n" + : + : "r"(src_abgr), // %0 + "r"(dst_argb), // %1 + "r"(pix), // %2 + "r"(kShuffleMaskABGRToARGB) // %3 + : "memory" +); +} + +#define HAS_BGRATOARGBROW_SSSE3 +static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, + int pix) { + asm( + "movdqa (%3),%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "lea 0x10(%0),%0\n" + "pshufb %%xmm7,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub 
$0x4,%2\n" + "ja 1b\n" + : + : "r"(src_bgra), // %0 + "r"(dst_argb), // %1 + "r"(pix), // %2 + "r"(kShuffleMaskBGRAToARGB) // %3 + : "memory" +); +} + +#define HAS_BG24TOARGBROW_SSSE3 +static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, + int pix) { + asm( + "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 + "pslld $0x18,%%xmm7\n" + "movdqa (%3),%%xmm6\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa 0x20(%0),%%xmm3\n" + "lea 0x30(%0),%0\n" + "movdqa %%xmm3,%%xmm2\n" + "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } + "pshufb %%xmm6,%%xmm2\n" + "por %%xmm7,%%xmm2\n" + "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } + "pshufb %%xmm6,%%xmm0\n" + "movdqa %%xmm2,0x20(%1)\n" + "por %%xmm7,%%xmm0\n" + "pshufb %%xmm6,%%xmm1\n" + "movdqa %%xmm0,(%1)\n" + "por %%xmm7,%%xmm1\n" + "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } + "pshufb %%xmm6,%%xmm3\n" + "movdqa %%xmm1,0x10(%1)\n" + "por %%xmm7,%%xmm3\n" + "movdqa %%xmm3,0x30(%1)\n" + "lea 0x40(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : + : "r"(src_bg24), // %0 + "r"(dst_argb), // %1 + "r"(pix), // %2 + "r"(kShuffleMaskBG24ToARGB) // %3 + : "memory" +); +} + +#define HAS_RAWTOARGBROW_SSSE3 +static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { + asm( + "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 + "pslld $0x18,%%xmm7\n" + "movdqa (%3),%%xmm6\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa 0x20(%0),%%xmm3\n" + "lea 0x30(%0),%0\n" + "movdqa %%xmm3,%%xmm2\n" + "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } + "pshufb %%xmm6,%%xmm2\n" + "por %%xmm7,%%xmm2\n" + "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } + "pshufb %%xmm6,%%xmm0\n" + "movdqa %%xmm2,0x20(%1)\n" + "por %%xmm7,%%xmm0\n" + "pshufb %%xmm6,%%xmm1\n" + "movdqa %%xmm0,(%1)\n" + "por %%xmm7,%%xmm1\n" + "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } + "pshufb 
%%xmm6,%%xmm3\n" + "movdqa %%xmm1,0x10(%1)\n" + "por %%xmm7,%%xmm3\n" + "movdqa %%xmm3,0x30(%1)\n" + "lea 0x40(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : + : "r"(src_raw), // %0 + "r"(dst_argb), // %1 + "r"(pix), // %2 + "r"(kShuffleMaskRAWToARGB) // %3 + : "memory" +); +} #endif static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) { // Copy a Y to RGB. for (int x = 0; x < pix; ++x) { - dst_argb[2] = dst_argb[1] = dst_argb[0] = src_y[0]; + uint8 y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = 255u; dst_argb += 4; - src_y += 1; + ++src_y; } } // Convert I400 to ARGB. -void I400ToARGB(const uint8* src_y, int src_pitch_y, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix); #if defined(HAS_I400TOARGBROW_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && (width % 8 == 0) && - IS_ALIGNED(src_y, 8) && (src_pitch_y % 8 == 0) && - IS_ALIGNED(dst_argb, 16) && (dst_pitch_argb % 16 == 0)) { + IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { I400ToARGBRow = I400ToARGBRow_SSE2; } else #endif @@ -1201,16 +1342,21 @@ void I400ToARGB(const uint8* src_y, int src_pitch_y, for (int y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, width); - src_y += src_pitch_y; - dst_argb += dst_pitch_argb; + src_y += src_stride_y; + dst_argb += dst_stride_argb; } + return 0; } + static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) { for (int x = 0; x < pix; ++x) { - dst_argb[0] = src_raw[2]; - dst_argb[1] = src_raw[1]; - dst_argb[2] = src_raw[0]; + uint8 r = src_raw[0]; + uint8 g = src_raw[1]; + uint8 b = src_raw[2]; + dst_argb[0] = b; 
+ dst_argb[1] = g; + dst_argb[2] = r; dst_argb[3] = 255u; dst_argb += 4; src_raw += 3; @@ -1218,21 +1364,44 @@ static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) { } // Convert RAW to ARGB. -void RAWToARGB(const uint8* src_raw, int src_pitch_raw, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { - for (int y = 0; y < height; ++y) { - RAWToARGBRow_C(src_raw, dst_argb, width); - src_raw += src_pitch_raw; - dst_argb += dst_pitch_argb; +int RAWToARGB(const uint8* src_raw, int src_stride_raw, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; } + void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix); +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } else +#endif + { + RAWToARGBRow = RAWToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; } static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) { for (int x = 0; x < pix; ++x) { - dst_argb[0] = src_bg24[0]; - dst_argb[1] = src_bg24[1]; - dst_argb[2] = src_bg24[2]; + uint8 b = src_bg24[0]; + uint8 g = src_bg24[1]; + uint8 r = src_bg24[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; dst_argb[3] = 255u; dst_argb += 4; src_bg24 += 3; @@ -1240,36 +1409,127 @@ static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) { } // Convert BG24 to ARGB. 
-void BG24ToARGB(const uint8* src_bg24, int src_pitch_bg24, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { - for (int y = 0; y < height; ++y) { - BG24ToARGBRow_C(src_bg24, dst_argb, width); - src_bg24 += src_pitch_bg24; - dst_argb += dst_pitch_argb; +int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_bg24 = src_bg24 + (height - 1) * src_stride_bg24; + src_stride_bg24 = -src_stride_bg24; } + void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix); +#if defined(HAS_BG24TOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + BG24ToARGBRow = BG24ToARGBRow_SSSE3; + } else +#endif + { + BG24ToARGBRow = BG24ToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + BG24ToARGBRow(src_bg24, dst_argb, width); + src_bg24 += src_stride_bg24; + dst_argb += dst_stride_argb; + } + return 0; } + static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { for (int x = 0; x < pix; ++x) { - dst_argb[0] = src_abgr[2]; - dst_argb[1] = src_abgr[1]; - dst_argb[2] = src_abgr[0]; - dst_argb[3] = src_abgr[3]; + // To support in-place conversion. + uint8 r = src_abgr[0]; + uint8 g = src_abgr[1]; + uint8 b = src_abgr[2]; + uint8 a = src_abgr[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; dst_argb += 4; src_abgr += 4; } } -// Convert ABGR to ARGB. 
-void ABGRToARGB(const uint8* src_abgr, int src_pitch_abgr, - uint8* dst_argb, int dst_pitch_argb, - int width, int height) { +int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix); +#if defined(HAS_ABGRTOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 4 == 0) && + IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + ABGRToARGBRow = ABGRToARGBRow_SSSE3; + } else +#endif + { + ABGRToARGBRow = ABGRToARGBRow_C; + } + for (int y = 0; y < height; ++y) { - ABGRToARGBRow_C(src_abgr, dst_argb, width); - src_abgr += src_pitch_abgr; - dst_argb += dst_pitch_argb; + ABGRToARGBRow(src_abgr, dst_argb, width); + src_abgr += src_stride_abgr; + dst_argb += dst_stride_argb; + } + return 0; +} + +static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + // To support in-place conversion. + uint8 a = src_bgra[0]; + uint8 r = src_bgra[1]; + uint8 g = src_bgra[2]; + uint8 b = src_bgra[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_bgra += 4; } } +// Convert BGRA to ARGB. 
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix); +#if defined(HAS_BGRATOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 4 == 0) && + IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + BGRAToARGBRow = BGRAToARGBRow_SSSE3; + } else +#endif + { + BGRAToARGBRow = BGRAToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + BGRAToARGBRow(src_bgra, dst_argb, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + } // namespace libyuv + + diff --git a/source/rotate.cc b/source/rotate.cc index 8075d47fb..7d2c512c8 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -19,31 +19,31 @@ typedef void (*rotate_wxhfunc)(const uint8*, int, uint8*, int, int, int); #ifdef __ARM_NEON__ extern "C" { void ReverseLine_NEON(const uint8* src, uint8* dst, int width); -void Transpose_wx8_NEON(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, int width); +void Transpose_wx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); } // extern "C" #endif -static void Transpose_wx8_C(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, +static void Transpose_wx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int w) { int i, j; for (i = 0; i < w; ++i) for (j = 0; j < 8; ++j) - dst[i * dst_pitch + j] = src[j * src_pitch + i]; + dst[i * dst_stride + j] = src[j * src_stride + i]; } -static void Transpose_wxh_C(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, +static void Transpose_wxh_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height) { int i, j; for (i = 0; i < 
width; ++i) for (j = 0; j < height; ++j) - dst[i * dst_pitch + j] = src[j * src_pitch + i]; + dst[i * dst_stride + j] = src[j * src_stride + i]; } -void Transpose(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, +void Transpose(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height) { int i = height; rotate_wx8func Transpose_wx8; @@ -60,33 +60,33 @@ void Transpose(const uint8* src, int src_pitch, // work across the source in 8x8 tiles do { - Transpose_wx8(src, src_pitch, dst, dst_pitch, width); + Transpose_wx8(src, src_stride, dst, dst_stride, width); - src += 8 * src_pitch; + src += 8 * src_stride; dst += 8; i -= 8; } while (i >= 8); // TODO(frkoenig): Have wx4 and maybe wx2 - Transpose_wxh(src, src_pitch, dst, dst_pitch, width, i); + Transpose_wxh(src, src_stride, dst, dst_stride, width, i); } -void Rotate90(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, +void Rotate90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height) { - src += src_pitch*(height-1); - src_pitch = -src_pitch; + src += src_stride*(height-1); + src_stride = -src_stride; - Transpose(src, src_pitch, dst, dst_pitch, width, height); + Transpose(src, src_stride, dst, dst_stride, width, height); } -void Rotate270(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, +void Rotate270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height) { - dst += dst_pitch*(width-1); - dst_pitch = -dst_pitch; + dst += dst_stride*(width-1); + dst_stride = -dst_stride; - Transpose(src, src_pitch, dst, dst_pitch, width, height); + Transpose(src, src_stride, dst, dst_stride, width, height); } void ReverseLine_C(const uint8* src, uint8* dst, int width) { @@ -95,8 +95,8 @@ void ReverseLine_C(const uint8* src, uint8* dst, int width) { dst[width-1 - i] = src[i]; } -void Rotate180(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, +void Rotate180(const uint8* src, int src_stride, + 
uint8* dst, int dst_stride, int width, int height) { int i; reverse_func ReverseLine; @@ -108,13 +108,13 @@ void Rotate180(const uint8* src, int src_pitch, ReverseLine = ReverseLine_C; #endif - dst += dst_pitch*(height-1); + dst += dst_stride*(height-1); for (i = 0; i < height; ++i) { ReverseLine(src, dst, width); - src += src_pitch; - dst -= dst_pitch; + src += src_stride; + dst -= dst_stride; } } diff --git a/source/rotate.h b/source/rotate.h index d15ad6709..f6a90ffe3 100644 --- a/source/rotate.h +++ b/source/rotate.h @@ -11,34 +11,35 @@ #ifndef LIBYUV_SOURCE_ROTATE_H_ #define LIBYUV_SOURCE_ROTATE_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" namespace libyuv { -void Rotate90(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, + +void Rotate90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height); -void Rotate180(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, +void Rotate180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height); -void Rotate270(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, +void Rotate270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height); -void Rotate90_deinterleave(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +void Rotate90_deinterleave(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width, int height); -void Rotate180_deinterleave(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +void Rotate180_deinterleave(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width, int height); -void Rotate270_deinterleave(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +void Rotate270_deinterleave(const uint8* src, int src_stride, + uint8* 
dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width, int height); -void Transpose(const uint8* src, int src_pitch, - uint8* dst, int dst_pitch, +void Transpose(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height); } // namespace libyuv diff --git a/source/rotate_deinterleave.cc b/source/rotate_deinterleave.cc index fcbb0d42b..071335d73 100644 --- a/source/rotate_deinterleave.cc +++ b/source/rotate_deinterleave.cc @@ -27,40 +27,40 @@ void ReverseLine_di_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); void SaveRegisters_NEON(unsigned long long *store); -void Transpose_di_wx8_NEON(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +void Transpose_di_wx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); } // extern "C" #endif -static void Transpose_di_wx8_C(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +static void Transpose_di_wx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int w) { int i, j; for (i = 0; i < w*2; i += 2) for (j = 0; j < 8; ++j) { - dst_a[j + (i>>1)*dst_pitch_a] = src[i + j*src_pitch]; - dst_b[j + (i>>1)*dst_pitch_b] = src[i + j*src_pitch + 1]; + dst_a[j + (i>>1)*dst_stride_a] = src[i + j*src_stride]; + dst_b[j + (i>>1)*dst_stride_b] = src[i + j*src_stride + 1]; } } -static void Transpose_di_wxh_C(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +static void Transpose_di_wxh_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int w, int h) { int i, j; for (i = 0; i < w*2; i += 2) for (j = 0; j < h; ++j) { - dst_a[j + (i>>1)*dst_pitch_a] = src[i + j*src_pitch]; - dst_b[j + (i>>1)*dst_pitch_b] = src[i + j*src_pitch + 1]; + dst_a[j + (i>>1)*dst_stride_a] = src[i + 
j*src_stride]; + dst_b[j + (i>>1)*dst_stride_b] = src[i + j*src_stride + 1]; } } -void Transpose_deinterleave(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +void Transpose_deinterleave(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width, int height) { int i = height; rotate_wx8func Transpose_wx8; @@ -81,20 +81,20 @@ void Transpose_deinterleave(const uint8* src, int src_pitch, // work across the source in 8x8 tiles do { - Transpose_wx8(src, src_pitch, - dst_a, dst_pitch_a, - dst_b, dst_pitch_b, + Transpose_wx8(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, width); - src += 8 * src_pitch; + src += 8 * src_stride; dst_a += 8; dst_b += 8; i -= 8; } while (i >= 8); - Transpose_wxh(src, src_pitch, - dst_a, dst_pitch_a, - dst_b, dst_pitch_b, + Transpose_wxh(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, width, i); #ifdef __ARM_NEON__ @@ -102,31 +102,31 @@ void Transpose_deinterleave(const uint8* src, int src_pitch, #endif } -void Rotate90_deinterleave(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +void Rotate90_deinterleave(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width, int height) { - src += src_pitch*(height-1); - src_pitch = -src_pitch; + src += src_stride*(height-1); + src_stride = -src_stride; - Transpose_deinterleave(src, src_pitch, - dst_a, dst_pitch_a, - dst_b, dst_pitch_b, + Transpose_deinterleave(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, width, height); } -void Rotate270_deinterleave(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +void Rotate270_deinterleave(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width, int height) { - dst_a += dst_pitch_a*((width>>1)-1); - dst_b += 
dst_pitch_b*((width>>1)-1); - dst_pitch_a = -dst_pitch_a; - dst_pitch_b = -dst_pitch_b; + dst_a += dst_stride_a*((width>>1)-1); + dst_b += dst_stride_b*((width>>1)-1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; - Transpose_deinterleave(src, src_pitch, - dst_a, dst_pitch_a, - dst_b, dst_pitch_b, + Transpose_deinterleave(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, width, height); } @@ -140,9 +140,9 @@ static void ReverseLine_di_C(const uint8* src, } } -void Rotate180_deinterleave(const uint8* src, int src_pitch, - uint8* dst_a, int dst_pitch_a, - uint8* dst_b, int dst_pitch_b, +void Rotate180_deinterleave(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width, int height) { int i; reverse_func ReverseLine; @@ -154,17 +154,17 @@ void Rotate180_deinterleave(const uint8* src, int src_pitch, ReverseLine = ReverseLine_di_C; #endif - dst_a += dst_pitch_a*(height-1); - dst_b += dst_pitch_b*(height-1); + dst_a += dst_stride_a*(height-1); + dst_b += dst_stride_b*(height-1); width >>= 1; for (i = 0; i < height; ++i) { ReverseLine(src, dst_a, dst_b, width); - src += src_pitch; - dst_a -= dst_pitch_a; - dst_b -= dst_pitch_b; + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; } } diff --git a/source/row.h b/source/row.h index 67119b553..a11d80251 100644 --- a/source/row.h +++ b/source/row.h @@ -11,7 +11,7 @@ #ifndef LIBYUV_SOURCE_ROW_H_ #define LIBYUV_SOURCE_ROW_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" extern "C" { void FastConvertYUVToRGB32Row(const uint8* y_buf, diff --git a/source/scale.cc b/source/scale.cc index 2f802c93e..c87621294 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -8,37 +8,24 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "scale.h" +#include "libyuv/scale.h" #include #include -#include "cpu_id.h" +#include "libyuv/cpu_id.h" + +#if defined(_MSC_VER) +#define ALIGN16(var) __declspec(align(16)) var +#else +#define ALIGN16(var) var __attribute__((aligned(16))) +#endif // Note: A Neon reference manual // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html // Note: Some SSE2 reference manuals // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf -// TODO(fbarchard): Remove once performance is known -//#define TEST_RSTSC - -#if defined(TEST_RSTSC) -#include -#include -#ifdef _MSC_VER -#include -#endif - -#if defined(__GNUC__) && defined(__i386__) -static inline uint64 __rdtsc(void) { - uint32_t a, d; - __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); - return ((uint64)d << 32) + a; -} -#endif -#endif - namespace libyuv { // Set the following flag to true to revert to only @@ -47,7 +34,7 @@ namespace libyuv { // when comparing the quality of the resulting YUV planes // as produced by the optimized and non-optimized versions. 
-bool use_reference_impl_ = false; +static bool use_reference_impl_ = false; void SetUseReferenceImpl(bool use) { use_reference_impl_ = use; @@ -62,8 +49,8 @@ void SetUseReferenceImpl(bool use) { #if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) #define HAS_SCALEROWDOWN2_NEON -void ScaleRowDown2_NEON(const uint8* iptr, int32 /* istride */, - uint8* dst, int32 owidth) { +void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst, int dst_width) { __asm__ volatile ( "1:\n" @@ -72,13 +59,13 @@ void ScaleRowDown2_NEON(const uint8* iptr, int32 /* istride */, "subs %2, %2, #16 \n" // 16 processed per loop "bhi 1b \n" : // Output registers - : "r"(iptr), "r"(dst), "r"(owidth) // Input registers + : "r"(src_ptr), "r"(dst), "r"(dst_width) // Input registers : "r4", "q0", "q1" // Clobber List ); } -void ScaleRowDown2Int_NEON(const uint8* iptr, int32 istride, - uint8* dst, int32 owidth) { +void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { __asm__ volatile ( "mov r4, #2 \n" // rounding constant @@ -99,7 +86,7 @@ void ScaleRowDown2Int_NEON(const uint8* iptr, int32 istride, "subs %3, %3, #16 \n" // 16 processed per loop "bhi 1b \n" : // Output registers - : "r"(iptr), "r"(istride), "r"(dst), "r"(owidth) // Input registers + : "r"(src_ptr), "r"(src_stride), "r"(dst), "r"(dst_width) // Input registers : "r4", "q0", "q1", "q2", "q3", "q4" // Clobber List ); } @@ -201,15 +188,15 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) = #define HAS_SCALEROWDOWN2_SSE2 // Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: iptr 16 byte aligned, optr 16 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 16 byte aligned. 
__declspec(naked) -static void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride, - uint8* optr, int32 owidth) { +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width) { __asm { - mov eax, [esp + 4] // iptr - // istride ignored + mov eax, [esp + 4] // src_ptr + // src_stride ignored mov edx, [esp + 12] // optr - mov ecx, [esp + 16] // owidth + mov ecx, [esp + 16] // dst_width pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff psrlw xmm7, 8 @@ -229,16 +216,16 @@ static void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride, } } // Blends 32x2 rectangle to 16x1. -// Alignment requirement: iptr 16 byte aligned, optr 16 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 16 byte aligned. __declspec(naked) -static void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride, - uint8* optr, int32 owidth) { +static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // iptr - mov esi, [esp + 4 + 8] // istride + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // optr - mov ecx, [esp + 4 + 16] // owidth + mov ecx, [esp + 4 + 16] // dst_width pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff psrlw xmm7, 8 @@ -273,16 +260,16 @@ static void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride, #define HAS_SCALEROWDOWN4_SSE2 // Point samples 32 pixels to 8 pixels. -// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned. 
__declspec(naked) -static void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth) { +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - // istride ignored + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored mov edi, [esp + 32 + 12] // orow - mov ecx, [esp + 32 + 16] // owidth + mov ecx, [esp + 32 + 16] // dst_width pcmpeqb xmm7, xmm7 // generate mask 0x000000ff psrld xmm7, 24 @@ -305,19 +292,19 @@ static void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride, } // Blends 32x4 rectangle to 8x1. -// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned. __declspec(naked) -static void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth) { +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - mov ebx, [esp + 32 + 8] // istride + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride mov edi, [esp + 32 + 12] // orow - mov ecx, [esp + 32 + 16] // owidth + mov ecx, [esp + 32 + 16] // dst_width pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff psrlw xmm7, 8 - lea edx, [ebx + ebx * 2] // istride * 3 + lea edx, [ebx + ebx * 2] // src_stride * 3 wloop: movdqa xmm0, [esi] @@ -364,17 +351,17 @@ static void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride, #define HAS_SCALEROWDOWN8_SSE2 // Point samples 32 pixels to 4 pixels. -// Alignment requirement: iptr 16 byte aligned, optr 4 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 4 byte aligned. 
__declspec(naked) -static void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth) { +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - // istride ignored + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored mov edi, [esp + 32 + 12] // orow - mov ecx, [esp + 32 + 16] // owidth - pcmpeqb xmm7, xmm7 // generate mask isolating 1 in 8 bytes + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask isolating 1 in 8 bytes psrlq xmm7, 56 wloop: @@ -397,17 +384,17 @@ static void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride, } // Blends 32x8 rectangle to 4x1. -// Alignment requirement: iptr 16 byte aligned, optr 4 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 4 byte aligned. __declspec(naked) -static void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth) { +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - mov ebx, [esp + 32 + 8] // istride + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride mov edi, [esp + 32 + 12] // orow - mov ecx, [esp + 32 + 16] // owidth - lea edx, [ebx + ebx * 2] // istride * 3 + mov ecx, [esp + 32 + 16] // dst_width + lea edx, [ebx + ebx * 2] // src_stride * 3 pxor xmm7, xmm7 wloop: @@ -470,16 +457,16 @@ static void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride, // Then shuffled to do the scaling. // Note that movdqa+palign may be better than movdqu. -// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned.
__declspec(naked) -static void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth) { +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - // istride ignored + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored mov edi, [esp + 32 + 12] // orow - mov ecx, [esp + 32 + 16] // owidth + mov ecx, [esp + 32 + 16] // dst_width movdqa xmm3, _shuf0 movdqa xmm4, _shuf1 movdqa xmm5, _shuf2 @@ -520,16 +507,16 @@ static void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride, // xmm7 round34 // Note that movdqa+palign may be better than movdqu. -// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned. __declspec(naked) -static void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth) { +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - mov ebx, [esp + 32 + 8] // istride + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride mov edi, [esp + 32 + 12] // orow - mov ecx, [esp + 32 + 16] // owidth + mov ecx, [esp + 32 + 16] // dst_width movdqa xmm2, _shuf01 movdqa xmm3, _shuf11 movdqa xmm4, _shuf21 @@ -577,16 +564,16 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride, } // Note that movdqa+palign may be better than movdqu. -// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned. 
__declspec(naked) -static void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth) { +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - mov ebx, [esp + 32 + 8] // istride + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride mov edi, [esp + 32 + 12] // orow - mov ecx, [esp + 32 + 16] // owidth + mov ecx, [esp + 32 + 16] // dst_width movdqa xmm2, _shuf01 movdqa xmm3, _shuf11 movdqa xmm4, _shuf21 @@ -641,14 +628,14 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride, // Scale 32 pixels to 12 __declspec(naked) -static void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride, - uint8* optr, int32 owidth) { +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - mov edx, [esp + 32 + 8] // istride + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride mov edi, [esp + 32 + 12] // optr - mov ecx, [esp + 32 + 16] // owidth + mov ecx, [esp + 32 + 16] // dst_width movdqa xmm5, _shuf38a movdqa xmm6, _shuf38b pxor xmm7, xmm7 @@ -675,14 +662,14 @@ static void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride, // Scale 16x3 pixels to 6x1 with interpolation __declspec(naked) -static void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride, - uint8* optr, int32 owidth) { +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - mov edx, [esp + 32 + 8] // istride + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride mov edi, [esp + 32 + 12] // optr - mov ecx, [esp + 32 + 16] // owidth + mov ecx, [esp + 32 + 16] // dst_width movdqa xmm4, _shufac0 movdqa xmm5, _shufac3 movdqa xmm6, _scaleac3 @@ -739,14 +726,14 @@ static void 
ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride, // Scale 16x2 pixels to 6x1 with interpolation __declspec(naked) -static void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride, - uint8* optr, int32 owidth) { +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - mov edx, [esp + 32 + 8] // istride + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride mov edi, [esp + 32 + 12] // optr - mov ecx, [esp + 32 + 16] // owidth + mov ecx, [esp + 32 + 16] // dst_width movdqa xmm4, _shufab0 movdqa xmm5, _shufab1 movdqa xmm6, _shufab2 @@ -784,14 +771,14 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride, // Reads 8xN bytes and produces 16 shorts at a time. __declspec(naked) -static void ScaleAddRows_SSE2(const uint8* iptr, int32 istride, - uint16* orow, int32 iwidth, int32 iheight) { +static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* orow, int src_width, int src_height) { __asm { pushad - mov esi, [esp + 32 + 4] // iptr - mov edx, [esp + 32 + 8] // istride + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride mov edi, [esp + 32 + 12] // orow - mov ecx, [esp + 32 + 16] // owidth + mov ecx, [esp + 32 + 16] // src_width mov ebx, [esp + 32 + 20] // height pxor xmm7, xmm7 dec ebx @@ -833,15 +820,15 @@ static void ScaleAddRows_SSE2(const uint8* iptr, int32 istride, // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2 __declspec(naked) -static void ScaleFilterRows_SSE2(uint8* optr, const uint8* iptr0, int32 istride, - int owidth, int source_y_fraction) { +static void ScaleFilterRows_SSE2(uint8* optr, const uint8* iptr0, int src_stride, + int dst_width, int source_y_fraction) { __asm { push esi push edi mov edi, [esp + 8 + 4] // optr mov esi, [esp + 8 + 8] // iptr0 - mov edx, [esp + 8 + 12] // istride - mov ecx, [esp + 8 + 16] // owidth + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) cmp eax, 0 je xloop1 @@ -923,15 +910,15 @@ static void ScaleFilterRows_SSE2(uint8* optr, const uint8* iptr0, int32 istride, // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. #define HAS_SCALEFILTERROWS_SSSE3 __declspec(naked) -static void ScaleFilterRows_SSSE3(uint8* optr, const uint8* iptr0, int32 istride, - int owidth, int source_y_fraction) { +static void ScaleFilterRows_SSSE3(uint8* optr, const uint8* iptr0, int src_stride, + int dst_width, int source_y_fraction) { __asm { push esi push edi mov edi, [esp + 8 + 4] // optr mov esi, [esp + 8 + 8] // iptr0 - mov edx, [esp + 8 + 12] // istride - mov ecx, [esp + 8 + 16] // owidth + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) cmp eax, 0 je xloop1 @@ -1003,14 +990,14 @@ static void ScaleFilterRows_SSSE3(uint8* optr, const uint8* iptr0, int32 istride } // Note that movdqa+palign may be better than movdqu. -// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned. +// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned. 
__declspec(naked) -static void ScaleFilterCols34_SSSE3(uint8* optr, const uint8* iptr, - int owidth) { +static void ScaleFilterCols34_SSSE3(uint8* optr, const uint8* src_ptr, + int dst_width) { __asm { mov edx, [esp + 4] // optr - mov eax, [esp + 8] // iptr - mov ecx, [esp + 12] // owidth + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width movdqa xmm1, _round34 movdqa xmm2, _shuf01 movdqa xmm3, _shuf11 @@ -1056,8 +1043,8 @@ static void ScaleFilterCols34_SSSE3(uint8* optr, const uint8* iptr, // Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt #define HAS_SCALEROWDOWN2_SSE2 -extern "C" void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +extern "C" void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1087,8 +1074,8 @@ extern "C" void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride, "ret\n" ); -extern "C" void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +extern "C" void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1132,8 +1119,8 @@ extern "C" void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride, ); #define HAS_SCALEROWDOWN4_SSE2 -extern "C" void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +extern "C" void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1166,8 +1153,8 @@ extern "C" void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride, "ret\n" ); -extern "C" void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +extern "C" void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1225,8 +1212,8 @@ extern "C" void 
ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride, ); #define HAS_SCALEROWDOWN8_SSE2 -extern "C" void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +extern "C" void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1260,8 +1247,8 @@ extern "C" void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride, "ret\n" ); -extern "C" void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1331,8 +1318,8 @@ extern "C" void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride, // fpic is used for magiccam plugin #if !defined(__PIC__) #define HAS_SCALEROWDOWN34_SSSE3 -extern "C" void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1369,8 +1356,8 @@ extern "C" void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride, "ret\n" ); -extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1430,8 +1417,8 @@ extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride, "ret\n" ); -extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1494,8 +1481,8 @@ extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride, ); #define HAS_SCALEROWDOWN38_SSSE3 -extern "C" void ScaleRowDown38_SSSE3(const 
uint8* iptr, int32 istride, - uint8* optr, int32 owidth); +extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1531,8 +1518,8 @@ extern "C" void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride, "ret\n" ); -extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride, - uint8* optr, int32 owidth); +extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1595,8 +1582,8 @@ extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride, "ret\n" ); -extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride, - uint8* optr, int32 owidth); +extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width); asm( ".text\n" #if defined(OSX) @@ -1641,8 +1628,8 @@ extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride, #endif // __PIC__ #define HAS_SCALEADDROWS_SSE2 -extern "C" void ScaleAddRows_SSE2(const uint8* iptr, int32 istride, - uint16* orow, int32 iwidth, int32 iheight); +extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* orow, int src_width, int src_height); asm( ".text\n" #if defined(OSX) @@ -1692,8 +1679,8 @@ extern "C" void ScaleAddRows_SSE2(const uint8* iptr, int32 istride, // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version #define HAS_SCALEFILTERROWS_SSE2 extern "C" void ScaleFilterRows_SSE2(uint8* optr, - const uint8* iptr0, int32 istride, - int owidth, int source_y_fraction); + const uint8* iptr0, int src_stride, + int dst_width, int source_y_fraction); asm( ".text\n" #if defined(OSX) @@ -1787,8 +1774,8 @@ extern "C" void ScaleFilterRows_SSE2(uint8* optr, // Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version #define HAS_SCALEFILTERROWS_SSSE3 extern "C" void ScaleFilterRows_SSSE3(uint8* optr, - const uint8* iptr0, int32 istride, - int owidth, int source_y_fraction); + const uint8* iptr0, int src_stride, + int dst_width, int source_y_fraction); asm( ".text\n" #if defined(OSX) @@ -1870,42 +1857,42 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* optr, #endif // CPU agnostic row functions -static void ScaleRowDown2_C(const uint8* iptr, int32, - uint8* dst, int32 owidth) { - for (int x = 0; x < owidth; ++x) { - *dst++ = *iptr; - iptr += 2; +static void ScaleRowDown2_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 2; } } -static void ScaleRowDown2Int_C(const uint8* iptr, int32 istride, - uint8* dst, int32 owidth) { - for (int x = 0; x < owidth; ++x) { - *dst++ = (iptr[0] + iptr[1] + - iptr[istride] + iptr[istride + 1] + 2) >> 2; - iptr += 2; +static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + + src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; + src_ptr += 2; } } -static void ScaleRowDown4_C(const uint8* iptr, int32, - uint8* dst, int32 owidth) { - for (int x = 0; x < owidth; ++x) { - *dst++ = *iptr; - iptr += 4; +static void ScaleRowDown4_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 4; } } -static void ScaleRowDown4Int_C(const uint8* iptr, int32 istride, - uint8* dst, int32 owidth) { - for (int x = 0; x < owidth; ++x) { - *dst++ = (iptr[0] + iptr[1] + iptr[2] + iptr[3] + - iptr[istride + 0] + iptr[istride + 1] + - iptr[istride + 2] + iptr[istride + 3] + - iptr[istride * 2 + 0] + iptr[istride * 2 + 1] + - iptr[istride * 2 + 2] + iptr[istride * 2 + 3] + - iptr[istride * 3 + 0] + iptr[istride * 3 + 1] + - iptr[istride * 3 + 2] + iptr[istride * 3 + 3] + 8) >> 4; 
- iptr += 4; +static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + + src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + + src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + + src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >> 4; + src_ptr += 4; } } @@ -1914,46 +1901,46 @@ static void ScaleRowDown4Int_C(const uint8* iptr, int32 istride, static const int kMaxOutputWidth = 640; static const int kMaxRow12 = kMaxOutputWidth * 2; -static void ScaleRowDown8_C(const uint8* iptr, int32, - uint8* dst, int32 owidth) { - for (int x = 0; x < owidth; ++x) { - *dst++ = *iptr; - iptr += 8; +static void ScaleRowDown8_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 8; } } // Note calling code checks width is less than max and if not // uses ScaleRowDown8_C instead. 
-static void ScaleRowDown8Int_C(const uint8* iptr, int32 istride, - uint8* dst, int32 owidth) { +static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { ALIGN16(uint8 irow[kMaxRow12 * 2]); - assert(owidth <= kMaxOutputWidth); - ScaleRowDown4Int_C(iptr, istride, irow, owidth * 2); - ScaleRowDown4Int_C(iptr + istride * 4, istride, irow + kMaxOutputWidth, - owidth * 2); - ScaleRowDown2Int_C(irow, kMaxOutputWidth, dst, owidth); + assert(dst_width <= kMaxOutputWidth); + ScaleRowDown4Int_C(src_ptr, src_stride, irow, dst_width * 2); + ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, irow + kMaxOutputWidth, + dst_width * 2); + ScaleRowDown2Int_C(irow, kMaxOutputWidth, dst, dst_width); } -static void ScaleRowDown34_C(const uint8* iptr, int32, - uint8* dst, int32 owidth) { - assert((owidth % 3 == 0) && (owidth > 0)); - uint8* dend = dst + owidth; +static void ScaleRowDown34_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint8* dend = dst + dst_width; do { - dst[0] = iptr[0]; - dst[1] = iptr[1]; - dst[2] = iptr[3]; + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; dst += 3; - iptr += 4; + src_ptr += 4; } while (dst < dend); } // Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Int_C(const uint8* iptr, int32 istride, - uint8* d, int32 owidth) { - assert((owidth % 3 == 0) && (owidth > 0)); - uint8* dend = d + owidth; - const uint8* s = iptr; - const uint8* t = iptr + istride; +static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint8* dend = d + dst_width; + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; do { uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; @@ -1971,12 +1958,12 @@ static void ScaleRowDown34_0_Int_C(const uint8* iptr, int32 istride, } // Filter rows 1 
and 2 together, 1 : 1 -static void ScaleRowDown34_1_Int_C(const uint8* iptr, int32 istride, - uint8* d, int32 owidth) { - assert((owidth % 3 == 0) && (owidth > 0)); - uint8* dend = d + owidth; - const uint8* s = iptr; - const uint8* t = iptr + istride; +static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint8* dend = d + dst_width; + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; do { uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; @@ -1995,10 +1982,10 @@ static void ScaleRowDown34_1_Int_C(const uint8* iptr, int32 istride, #if defined(HAS_SCALEFILTERROWS_SSE2) // Filter row to 3/4 -static void ScaleFilterCols34_C(uint8* optr, const uint8* iptr, int owidth) { - assert((owidth % 3 == 0) && (owidth > 0)); - uint8* dend = optr + owidth; - const uint8* s = iptr; +static void ScaleFilterCols34_C(uint8* optr, const uint8* src_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint8* dend = optr + dst_width; + const uint8* s = src_ptr; do { optr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; optr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; @@ -2009,130 +1996,103 @@ static void ScaleFilterCols34_C(uint8* optr, const uint8* iptr, int owidth) { } #endif -static void ScaleFilterCols_C(uint8* optr, const uint8* iptr, - int owidth, int dx) { +static void ScaleFilterCols_C(uint8* optr, const uint8* src_ptr, + int dst_width, int dx) { int x = 0; - for (int j = 0; j < owidth; ++j) { + for (int j = 0; j < dst_width; ++j) { int xi = x >> 16; int xf1 = x & 0xffff; int xf0 = 65536 - xf1; - *optr++ = (iptr[xi] * xf0 + iptr[xi + 1] * xf1) >> 16; + *optr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; x += dx; } } -#ifdef TEST_RSTSC -uint64 timers34[4] = { 0, }; -#endif - static const int kMaxInputWidth = 2560; #if defined(HAS_SCALEFILTERROWS_SSE2) #define HAS_SCALEROWDOWN34_SSE2 // Filter rows 0 and 1 together, 3 : 
1 -static void ScaleRowDown34_0_Int_SSE2(const uint8* iptr, int32 istride, - uint8* d, int32 owidth) { - assert((owidth % 3 == 0) && (owidth > 0)); +static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); ALIGN16(uint8 row[kMaxInputWidth]); -#ifdef TEST_RSTSC - uint64 t1 = __rdtsc(); -#endif - ScaleFilterRows_SSE2(row, iptr, istride, owidth * 4 / 3, 256 / 4); -#ifdef TEST_RSTSC - uint64 t2 = __rdtsc(); -#endif - ScaleFilterCols34_C(d, row, owidth); - -#ifdef TEST_RSTSC - uint64 t3 = __rdtsc(); - timers34[0] += t2 - t1; - timers34[1] += t3 - t2; -#endif + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); + ScaleFilterCols34_C(d, row, dst_width); } // Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Int_SSE2(const uint8* iptr, int32 istride, - uint8* d, int32 owidth) { - assert((owidth % 3 == 0) && (owidth > 0)); +static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); ALIGN16(uint8 row[kMaxInputWidth]); -#ifdef TEST_RSTSC - uint64 t1 = __rdtsc(); -#endif - ScaleFilterRows_SSE2(row, iptr, istride, owidth * 4 / 3, 256 / 2); -#ifdef TEST_RSTSC - uint64 t2 = __rdtsc(); -#endif - ScaleFilterCols34_C(d, row, owidth); -#ifdef TEST_RSTSC - uint64 t3 = __rdtsc(); - timers34[2] += t2 - t1; - timers34[3] += t3 - t2; -#endif + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); + ScaleFilterCols34_C(d, row, dst_width); } #endif -static void ScaleRowDown38_C(const uint8* iptr, int32, - uint8* dst, int32 owidth) { - assert(owidth % 3 == 0); - for (int x = 0; x < owidth; x += 3) { - dst[0] = iptr[0]; - dst[1] = iptr[3]; - dst[2] = iptr[6]; +static void ScaleRowDown38_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + assert(dst_width % 3 == 0); + for (int x = 0; x < dst_width; x += 3) { + dst[0] = 
src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; dst += 3; - iptr += 8; + src_ptr += 8; } } // 8x3 -> 3x1 -static void ScaleRowDown38_3_Int_C(const uint8* iptr, int32 istride, - uint8* optr, int32 owidth) { - assert((owidth % 3 == 0) && (owidth > 0)); - for (int i = 0; i < owidth; i+=3) { - optr[0] = (iptr[0] + iptr[1] + iptr[2] + - iptr[istride + 0] + iptr[istride + 1] + iptr[istride + 2] + - iptr[istride * 2 + 0] + iptr[istride * 2 + 1] + iptr[istride * 2 + 2]) * +static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (int i = 0; i < dst_width; i+=3) { + optr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + src_ptr[src_stride + 2] + + src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * (65536 / 9) >> 16; - optr[1] = (iptr[3] + iptr[4] + iptr[5] + - iptr[istride + 3] + iptr[istride + 4] + iptr[istride + 5] + - iptr[istride * 2 + 3] + iptr[istride * 2 + 4] + iptr[istride * 2 + 5]) * + optr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + src_ptr[src_stride + 5] + + src_ptr[src_stride * 2 + 3] + src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * (65536 / 9) >> 16; - optr[2] = (iptr[6] + iptr[7] + - iptr[istride + 6] + iptr[istride + 7] + - iptr[istride * 2 + 6] + iptr[istride * 2 + 7]) * + optr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + + src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * (65536 / 6) >> 16; - iptr += 8; + src_ptr += 8; optr += 3; } } // 8x2 -> 3x1 -static void ScaleRowDown38_2_Int_C(const uint8* iptr, int32 istride, - uint8* optr, int32 owidth) { - assert((owidth % 3 == 0) && (owidth > 0)); - for (int i = 0; i < owidth; i+=3) { - optr[0] = (iptr[0] + iptr[1] + iptr[2] + - iptr[istride + 0] + iptr[istride + 1] + iptr[istride + 2]) * 
+static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, + uint8* optr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (int i = 0; i < dst_width; i+=3) { + optr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) * (65536 / 6) >> 16; - optr[1] = (iptr[3] + iptr[4] + iptr[5] + - iptr[istride + 3] + iptr[istride + 4] + iptr[istride + 5]) * + optr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) * (65536 / 6) >> 16; - optr[2] = (iptr[6] + iptr[7] + - iptr[istride + 6] + iptr[istride + 7]) * + optr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * (65536 / 4) >> 16; - iptr += 8; + src_ptr += 8; optr += 3; } } // C version 8x2 -> 8x1 static void ScaleFilterRows_C(uint8* optr, - const uint8* iptr0, int32 istride, - int owidth, int source_y_fraction) { - assert(owidth > 0); + const uint8* iptr0, int src_stride, + int dst_width, int source_y_fraction) { + assert(dst_width > 0); int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint8* iptr1 = iptr0 + istride; - uint8* end = optr + owidth; + const uint8* iptr1 = iptr0 + src_stride; + uint8* end = optr + dst_width; do { optr[0] = (iptr0[0] * y0_fraction + iptr1[0] * y1_fraction) >> 8; optr[1] = (iptr0[1] * y0_fraction + iptr1[1] * y1_fraction) >> 8; @@ -2149,16 +2109,16 @@ static void ScaleFilterRows_C(uint8* optr, optr[0] = optr[-1]; } -void ScaleAddRows_C(const uint8* iptr, int32 istride, - uint16* orow, int32 iwidth, int32 iheight) { - assert(iwidth > 0); - assert(iheight > 0); - for (int x = 0; x < iwidth; ++x) { - const uint8* s = iptr + x; +void ScaleAddRows_C(const uint8* src_ptr, int src_stride, + uint16* orow, int src_width, int src_height) { + assert(src_width > 0); + assert(src_height > 0); + for (int x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + 
x; int sum = 0; - for (int y = 0; y < iheight; ++y) { + for (int y = 0; y < src_height; ++y) { sum += s[0]; - s += istride; + s += src_stride; } orow[x] = sum; } @@ -2171,36 +2131,36 @@ void ScaleAddRows_C(const uint8* iptr, int32 istride, * its original size. * */ -static void ScalePlaneDown2(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr, - bool interpolate) { - assert(iwidth % 2 == 0); - assert(iheight % 2 == 0); - void (*ScaleRowDown2)(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr, + FilterMode filtering) { + assert(src_width % 2 == 0); + assert(src_height % 2 == 0); + void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); #if defined(HAS_SCALEROWDOWN2_NEON) if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (owidth % 16 == 0) && (istride % 16 == 0) && (ostride % 16 == 0) && - IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) { - ScaleRowDown2 = interpolate ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; + (dst_width % 16 == 0) && (src_stride % 16 == 0) && (ostride % 16 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; } else #endif #if defined(HAS_SCALEROWDOWN2_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (owidth % 16 == 0) && IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) { - ScaleRowDown2 = interpolate ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; + (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; } else #endif { - ScaleRowDown2 = interpolate ? ScaleRowDown2Int_C : ScaleRowDown2_C; + ScaleRowDown2 = filtering ? 
ScaleRowDown2Int_C : ScaleRowDown2_C; } - for (int y = 0; y < oheight; ++y) { - ScaleRowDown2(iptr, istride, optr, owidth); - iptr += (istride << 1); + for (int y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, optr, dst_width); + src_ptr += (src_stride << 1); optr += ostride; } } @@ -2211,30 +2171,30 @@ static void ScalePlaneDown2(int32 iwidth, int32 iheight, * This is an optimized version for scaling down a plane to 1/4 of * its original size. */ -static void ScalePlaneDown4(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr, - bool interpolate) { - assert(iwidth % 4 == 0); - assert(iheight % 4 == 0); - void (*ScaleRowDown4)(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr, + FilterMode filtering) { + assert(src_width % 4 == 0); + assert(src_height % 4 == 0); + void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); #if defined(HAS_SCALEROWDOWN4_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (owidth % 8 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) && - IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) { - ScaleRowDown4 = interpolate ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; + (dst_width % 8 == 0) && (src_stride % 16 == 0) && (ostride % 8 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; } else #endif { - ScaleRowDown4 = interpolate ? ScaleRowDown4Int_C : ScaleRowDown4_C; + ScaleRowDown4 = filtering ? 
ScaleRowDown4Int_C : ScaleRowDown4_C; } - for (int y = 0; y < oheight; ++y) { - ScaleRowDown4(iptr, istride, optr, owidth); - iptr += (istride << 2); + for (int y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, optr, dst_width); + src_ptr += (src_stride << 2); optr += ostride; } } @@ -2246,30 +2206,30 @@ static void ScalePlaneDown4(int32 iwidth, int32 iheight, * of its original size. * */ -static void ScalePlaneDown8(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr, - bool interpolate) { - assert(iwidth % 8 == 0); - assert(iheight % 8 == 0); - void (*ScaleRowDown8)(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +static void ScalePlaneDown8(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr, + FilterMode filtering) { + assert(src_width % 8 == 0); + assert(src_height % 8 == 0); + void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); #if defined(HAS_SCALEROWDOWN8_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (owidth % 16 == 0) && owidth <= kMaxOutputWidth && - (istride % 16 == 0) && (ostride % 16 == 0) && - IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) { - ScaleRowDown8 = interpolate ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; + (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth && + (src_stride % 16 == 0) && (ostride % 16 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 16)) { + ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; } else #endif { - ScaleRowDown8 = interpolate && (owidth <= kMaxOutputWidth) ? + ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? 
ScaleRowDown8Int_C : ScaleRowDown8_C; } - for (int y = 0; y < oheight; ++y) { - ScaleRowDown8(iptr, istride, optr, owidth); - iptr += (istride << 3); + for (int y = 0; y < dst_height; ++y) { + ScaleRowDown8(src_ptr, src_stride, optr, dst_width); + src_ptr += (src_stride << 3); optr += ostride; } } @@ -2280,21 +2240,21 @@ static void ScalePlaneDown8(int32 iwidth, int32 iheight, * Provided by Frank Barchard (fbarchard@google.com) * */ -static void ScalePlaneDown34(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8* iptr, uint8* optr, - bool interpolate) { - assert(owidth % 3 == 0); - void (*ScaleRowDown34_0)(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); - void (*ScaleRowDown34_1)(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr, + FilterMode filtering) { + assert(dst_width % 3 == 0); + void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); #if defined(HAS_SCALEROWDOWN34_SSSE3) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) && - IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) { - if (!interpolate) { + (dst_width % 24 == 0) && (src_stride % 16 == 0) && (ostride % 8 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 8)) { + if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_SSSE3; } else { @@ -2305,15 +2265,15 @@ static void ScalePlaneDown34(int32 iwidth, int32 iheight, #endif #if defined(HAS_SCALEROWDOWN34_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) && - IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8) && - 
interpolate) { + (dst_width % 24 == 0) && (src_stride % 16 == 0) && (ostride % 8 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 8) && + filtering) { ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; } else #endif { - if (!interpolate) { + if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_C; ScaleRowDown34_1 = ScaleRowDown34_C; } else { @@ -2322,35 +2282,28 @@ static void ScalePlaneDown34(int32 iwidth, int32 iheight, } } int irow = 0; - for (int y = 0; y < oheight; ++y) { + for (int y = 0; y < dst_height; ++y) { switch (irow) { case 0: - ScaleRowDown34_0(iptr, istride, optr, owidth); + ScaleRowDown34_0(src_ptr, src_stride, optr, dst_width); break; case 1: - ScaleRowDown34_1(iptr, istride, optr, owidth); + ScaleRowDown34_1(src_ptr, src_stride, optr, dst_width); break; case 2: - ScaleRowDown34_0(iptr + istride, -istride, optr, owidth); + ScaleRowDown34_0(src_ptr + src_stride, -src_stride, optr, dst_width); break; } ++irow; - iptr += istride; + src_ptr += src_stride; optr += ostride; if (irow >= 3) { - iptr += istride; + src_ptr += src_stride; irow = 0; } } - -#ifdef TEST_RSTSC - std::cout << "Timer34_0 Row " << std::setw(9) << timers34[0] - << " Column " << std::setw(9) << timers34[1] - << " Timer34_1 Row " << std::setw(9) << timers34[2] - << " Column " << std::setw(9) << timers34[3] << std::endl; -#endif } /** @@ -2361,21 +2314,21 @@ static void ScalePlaneDown34(int32 iwidth, int32 iheight, * * Reduces 16x3 to 6x1 */ -static void ScalePlaneDown38(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8* iptr, uint8* optr, - bool interpolate) { - assert(owidth % 3 == 0); - void (*ScaleRowDown38_3)(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); - void (*ScaleRowDown38_2)(const uint8* iptr, int32 istride, - uint8* orow, int32 owidth); +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int 
ostride, + const uint8* src_ptr, uint8* optr, + FilterMode filtering) { + assert(dst_width % 3 == 0); + void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, + uint8* orow, int dst_width); #if defined(HAS_SCALEROWDOWN38_SSSE3) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) && - IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) { - if (!interpolate) { + (dst_width % 24 == 0) && (src_stride % 16 == 0) && (ostride % 8 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 8)) { + if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3; } else { @@ -2385,7 +2338,7 @@ static void ScalePlaneDown38(int32 iwidth, int32 iheight, } else #endif { - if (!interpolate) { + if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_C; ScaleRowDown38_2 = ScaleRowDown38_C; } else { @@ -2394,18 +2347,18 @@ static void ScalePlaneDown38(int32 iwidth, int32 iheight, } } int irow = 0; - for (int y = 0; y < oheight; ++y) { + for (int y = 0; y < dst_height; ++y) { switch (irow) { case 0: case 1: - ScaleRowDown38_3(iptr, istride, optr, owidth); - iptr += istride * 3; + ScaleRowDown38_3(src_ptr, src_stride, optr, dst_width); + src_ptr += src_stride * 3; ++irow; break; case 2: - ScaleRowDown38_2(iptr, istride, optr, owidth); - iptr += istride * 2; + ScaleRowDown38_2(src_ptr, src_stride, optr, dst_width); + src_ptr += src_stride * 2; irow = 0; break; } @@ -2413,65 +2366,65 @@ static void ScalePlaneDown38(int32 iwidth, int32 iheight, } } -inline static uint32 SumBox(int32 iboxwidth, int32 iboxheight, - int32 istride, const uint8 *iptr) { +inline static uint32 SumBox(int iboxwidth, int iboxheight, + int src_stride, const uint8* src_ptr) { assert(iboxwidth > 0); assert(iboxheight > 0); uint32 sum = 0u; for (int y = 0; y < iboxheight; ++y) { for (int x = 0; x < iboxwidth; ++x) { - sum += iptr[x]; 
+ sum += src_ptr[x]; } - iptr += istride; + src_ptr += src_stride; } return sum; } -static void ScalePlaneBoxRow(int32 owidth, int32 boxheight, - int dx, int32 istride, - const uint8 *iptr, uint8 *optr) { +static void ScalePlaneBoxRow(int dst_width, int boxheight, + int dx, int src_stride, + const uint8* src_ptr, uint8* optr) { int x = 0; - for (int i = 0; i < owidth; ++i) { + for (int i = 0; i < dst_width; ++i) { int ix = x >> 16; x += dx; int boxwidth = (x >> 16) - ix; - *optr++ = SumBox(boxwidth, boxheight, istride, iptr + ix) / + *optr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / (boxwidth * boxheight); } } -inline static uint32 SumPixels(int32 iboxwidth, const uint16 *iptr) { +inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { assert(iboxwidth > 0); uint32 sum = 0u; for (int x = 0; x < iboxwidth; ++x) { - sum += iptr[x]; + sum += src_ptr[x]; } return sum; } -static void ScaleAddCols2_C(int32 owidth, int32 boxheight, int dx, - const uint16 *iptr, uint8 *optr) { +static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* optr) { int scaletbl[2]; int minboxwidth = (dx >> 16); scaletbl[0] = 65536 / (minboxwidth * boxheight); scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); int *scaleptr = scaletbl - minboxwidth; int x = 0; - for (int i = 0; i < owidth; ++i) { + for (int i = 0; i < dst_width; ++i) { int ix = x >> 16; x += dx; int boxwidth = (x >> 16) - ix; - *optr++ = SumPixels(boxwidth, iptr + ix) * scaleptr[boxwidth] >> 16; + *optr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; } } -static void ScaleAddCols1_C(int32 owidth, int32 boxheight, int dx, - const uint16 *iptr, uint8 *optr) { +static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* optr) { int boxwidth = (dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int x = 0; - for (int i = 0; i < owidth; ++i) { - *optr++ = SumPixels(boxwidth, iptr + x) * scaleval >> 
16; + for (int i = 0; i < dst_width; ++i) { + *optr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; x += boxwidth; } } @@ -2485,43 +2438,43 @@ static void ScaleAddCols1_C(int32 owidth, int32 boxheight, int dx, * through source, sampling a box of pixel with simple * averaging. */ -static void ScalePlaneBox(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr) { - assert(owidth > 0); - assert(oheight > 0); - int dy = (iheight << 16) / oheight; - int dx = (iwidth << 16) / owidth; - if ((iwidth % 16 != 0) || (iwidth > kMaxInputWidth) || - oheight * 2 > iheight) { - uint8 *dst = optr; - int dy = (iheight << 16) / oheight; - int dx = (iwidth << 16) / owidth; +static void ScalePlaneBox(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr) { + assert(dst_width > 0); + assert(dst_height > 0); + int dy = (src_height << 16) / dst_height; + int dx = (src_width << 16) / dst_width; + if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) || + dst_height * 2 > src_height) { + uint8* dst = optr; + int dy = (src_height << 16) / dst_height; + int dx = (src_width << 16) / dst_width; int y = 0; - for (int j = 0; j < oheight; ++j) { + for (int j = 0; j < dst_height; ++j) { int iy = y >> 16; - const uint8 *const src = iptr + iy * istride; + const uint8* const src = src_ptr + iy * src_stride; y += dy; - if (y > (iheight << 16)) { - y = (iheight << 16); + if (y > (src_height << 16)) { + y = (src_height << 16); } int boxheight = (y >> 16) - iy; - ScalePlaneBoxRow(owidth, boxheight, - dx, istride, + ScalePlaneBoxRow(dst_width, boxheight, + dx, src_stride, src, dst); dst += ostride; } } else { ALIGN16(uint16 row[kMaxInputWidth]); - void (*ScaleAddRows)(const uint8* iptr, int32 istride, - uint16* orow, int32 iwidth, int32 iheight); - void (*ScaleAddCols)(int32 owidth, int32 boxheight, int dx, - const uint16 *iptr, uint8 *optr); + void 
(*ScaleAddRows)(const uint8* src_ptr, int src_stride, + uint16* orow, int src_width, int src_height); + void (*ScaleAddCols)(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* optr); #if defined(HAS_SCALEADDROWS_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) { + (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && (src_width % 16) == 0) { ScaleAddRows = ScaleAddRows_SSE2; } else #endif @@ -2535,16 +2488,16 @@ static void ScalePlaneBox(int32 iwidth, int32 iheight, } int y = 0; - for (int j = 0; j < oheight; ++j) { + for (int j = 0; j < dst_height; ++j) { int iy = y >> 16; - const uint8 *const src = iptr + iy * istride; + const uint8* const src = src_ptr + iy * src_stride; y += dy; - if (y > (iheight << 16)) { - y = (iheight << 16); + if (y > (src_height << 16)) { + y = (src_height << 16); } int boxheight = (y >> 16) - iy; - ScaleAddRows(src, istride, row, iwidth, boxheight); - ScaleAddCols(owidth, boxheight, dx, row, optr); + ScaleAddRows(src, src_stride, row, src_width, boxheight); + ScaleAddCols(dst_width, boxheight, dx, row, optr); optr += ostride; } } @@ -2553,35 +2506,35 @@ static void ScalePlaneBox(int32 iwidth, int32 iheight, /** * Scale plane to/from any dimensions, with interpolation. */ -static void ScalePlaneBilinearSimple(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr) { - uint8 *dst = optr; - int dx = (iwidth << 16) / owidth; - int dy = (iheight << 16) / oheight; - int maxx = ((iwidth - 1) << 16) - 1; - int maxy = ((iheight - 1) << 16) - 1; - int y = (oheight < iheight) ? 
32768 : (iheight << 16) / oheight - 32768; - for (int i = 0; i < oheight; ++i) { +static void ScalePlaneBilinearSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr) { + uint8* dst = optr; + int dx = (src_width << 16) / dst_width; + int dy = (src_height << 16) / dst_height; + int maxx = ((src_width - 1) << 16) - 1; + int maxy = ((src_height - 1) << 16) - 1; + int y = (dst_height < src_height) ? 32768 : (src_height << 16) / dst_height - 32768; + for (int i = 0; i < dst_height; ++i) { int cy = (y < 0) ? 0 : y; int yi = cy >> 16; int yf = cy & 0xffff; - const uint8 *const src = iptr + yi * istride; - int x = (owidth < iwidth) ? 32768 : (iwidth << 16) / owidth - 32768; - for (int j = 0; j < owidth; ++j) { + const uint8* const src = src_ptr + yi * src_stride; + int x = (dst_width < src_width) ? 32768 : (src_width << 16) / dst_width - 32768; + for (int j = 0; j < dst_width; ++j) { int cx = (x < 0) ? 0 : x; int xi = cx >> 16; int xf = cx & 0xffff; int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; - int r1 = (src[xi + istride] * (65536 - xf) + src[xi + istride + 1] * xf) + int r1 = (src[xi + src_stride] * (65536 - xf) + src[xi + src_stride + 1] * xf) >> 16; *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; x += dx; if (x > maxx) x = maxx; } - dst += ostride - owidth; + dst += ostride - dst_width; y += dy; if (y > maxy) y = maxy; @@ -2592,33 +2545,33 @@ static void ScalePlaneBilinearSimple(int32 iwidth, int32 iheight, * Scale plane to/from any dimensions, with bilinear * interpolation. 
*/ -static void ScalePlaneBilinear(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr) { - assert(owidth > 0); - assert(oheight > 0); - int dy = (iheight << 16) / oheight; - int dx = (iwidth << 16) / owidth; - if ((iwidth % 8 != 0) || (iwidth > kMaxInputWidth)) { - ScalePlaneBilinearSimple(iwidth, iheight, owidth, oheight, istride, ostride, - iptr, optr); +static void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr) { + assert(dst_width > 0); + assert(dst_height > 0); + int dy = (src_height << 16) / dst_height; + int dx = (src_width << 16) / dst_width; + if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) { + ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src_ptr, optr); } else { ALIGN16(uint8 row[kMaxInputWidth + 1]); - void (*ScaleFilterRows)(uint8* optr, const uint8* iptr0, int32 istride, - int owidth, int source_y_fraction); - void (*ScaleFilterCols)(uint8* optr, const uint8* iptr, - int owidth, int dx); + void (*ScaleFilterRows)(uint8* optr, const uint8* iptr0, int src_stride, + int dst_width, int source_y_fraction); + void (*ScaleFilterCols)(uint8* optr, const uint8* src_ptr, + int dst_width, int dx); #if defined(HAS_SCALEFILTERROWS_SSSE3) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) { + (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && (src_width % 16) == 0) { ScaleFilterRows = ScaleFilterRows_SSSE3; } else #endif #if defined(HAS_SCALEFILTERROWS_SSE2) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) { + (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && (src_width % 16) == 0) { ScaleFilterRows = ScaleFilterRows_SSE2; } else #endif @@ -2628,13 +2581,13 @@ static void ScalePlaneBilinear(int32 
iwidth, int32 iheight, ScaleFilterCols = ScaleFilterCols_C; int y = 0; - int maxy = ((iheight - 1) << 16) - 1; // max is filter of last 2 rows. - for (int j = 0; j < oheight; ++j) { + int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. + for (int j = 0; j < dst_height; ++j) { int iy = y >> 16; int fy = (y >> 8) & 255; - const uint8 *const src = iptr + iy * istride; - ScaleFilterRows(row, src, istride, iwidth, fy); - ScaleFilterCols(optr, row, owidth, dx); + const uint8* const src = src_ptr + iy * src_stride; + ScaleFilterRows(row, src, src_stride, src_width, fy); + ScaleFilterCols(optr, row, dst_width, dx); optr += ostride; y += dy; if (y > maxy) { @@ -2650,39 +2603,39 @@ static void ScalePlaneBilinear(int32 iwidth, int32 iheight, * of x and dx is the integer part of the source position and * the lower 16 bits are the fixed decimal part. */ -static void ScalePlaneSimple(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr) { - uint8 *dst = optr; - int dx = (iwidth << 16) / owidth; - for (int y = 0; y < oheight; ++y) { - const uint8 *const src = iptr + (y * iheight / oheight) * istride; +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr) { + uint8* dst = optr; + int dx = (src_width << 16) / dst_width; + for (int y = 0; y < dst_height; ++y) { + const uint8* const src = src_ptr + (y * src_height / dst_height) * src_stride; // TODO(fbarchard): Round X coordinate by setting x=0x8000. int x = 0; - for (int i = 0; i < owidth; ++i) { + for (int i = 0; i < dst_width; ++i) { *dst++ = src[x >> 16]; x += dx; } - dst += ostride - owidth; + dst += ostride - dst_width; } } /** * Scale plane to/from any dimensions. 
*/ -static void ScalePlaneAnySize(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr, - bool interpolate) { - if (!interpolate) { - ScalePlaneSimple(iwidth, iheight, owidth, oheight, istride, ostride, - iptr, optr); +static void ScalePlaneAnySize(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src_ptr, optr); } else { // fall back to non-optimized version - ScalePlaneBilinear(iwidth, iheight, owidth, oheight, istride, ostride, - iptr, optr); + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src_ptr, optr); } } @@ -2694,20 +2647,21 @@ static void ScalePlaneAnySize(int32 iwidth, int32 iheight, * reference implementation for e.g. XGA->LowResPAL * */ -static void ScalePlaneDown(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr, - bool interpolate) { - if (!interpolate) { - ScalePlaneSimple(iwidth, iheight, owidth, oheight, istride, ostride, - iptr, optr); - } else if (iheight * 2 > oheight) { // between 1/2x and 1x use bilinear - ScalePlaneBilinear(iwidth, iheight, owidth, oheight, istride, ostride, - iptr, optr); +static void ScalePlaneDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src_ptr, optr); + } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { + // between 1/2x and 1x use bilinear + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src_ptr, optr); } else { - ScalePlaneBox(iwidth, 
iheight, owidth, oheight, istride, ostride, - iptr, optr); + ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src_ptr, optr); } } @@ -2719,71 +2673,71 @@ static void ScalePlaneDown(int32 iwidth, int32 iheight, * compared to the reference implementation. * */ -static void CopyPlane(int32 iwidth, int32 iheight, - int32 owidth, int32 oheight, - int32 istride, int32 ostride, - const uint8 *iptr, uint8 *optr) { - if (istride == iwidth && ostride == owidth) { +static void CopyPlane(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int ostride, + const uint8* src_ptr, uint8* optr) { + if (src_stride == src_width && ostride == dst_width) { // All contiguous, so can use REALLY fast path. - memcpy(optr, iptr, iwidth * iheight); + memcpy(optr, src_ptr, src_width * src_height); } else { // Not all contiguous; must copy scanlines individually - const uint8 *src = iptr; - uint8 *dst = optr; - for (int i = 0; i < iheight; ++i) { - memcpy(dst, src, iwidth); + const uint8* src = src_ptr; + uint8* dst = optr; + for (int i = 0; i < src_height; ++i) { + memcpy(dst, src, src_width); dst += ostride; - src += istride; + src += src_stride; } } } -static void ScalePlane(const uint8 *in, int32 istride, - int32 iwidth, int32 iheight, - uint8 *out, int32 ostride, - int32 owidth, int32 oheight, - bool interpolate, bool use_ref) { +static void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int ostride, + int dst_width, int dst_height, + FilterMode filtering, bool use_ref) { // Use specialized scales to improve performance for common resolutions. // For example, all the 1/2 scalings will use ScalePlaneDown2() - if (owidth == iwidth && oheight == iheight) { + if (dst_width == src_width && dst_height == src_height) { // Straight copy. 
- CopyPlane(iwidth, iheight, owidth, oheight, istride, ostride, in, out); - } else if (owidth <= iwidth && oheight <= iheight) { + CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, ostride, src, dst); + } else if (dst_width <= src_width && dst_height <= src_height) { // Scale down. if (use_ref) { // For testing, allow the optimized versions to be disabled. - ScalePlaneDown(iwidth, iheight, owidth, oheight, istride, ostride, - in, out, interpolate); - } else if (4 * owidth == 3 * iwidth && 4 * oheight == 3 * iheight) { + ScalePlaneDown(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src, dst, filtering); + } else if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { // optimized, 3/4 - ScalePlaneDown34(iwidth, iheight, owidth, oheight, istride, ostride, - in, out, interpolate); - } else if (2 * owidth == iwidth && 2 * oheight == iheight) { + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src, dst, filtering); + } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { // optimized, 1/2 - ScalePlaneDown2(iwidth, iheight, owidth, oheight, istride, ostride, - in, out, interpolate); + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src, dst, filtering); // 3/8 rounded up for odd sized chroma height. 
- } else if (8 * owidth == 3 * iwidth && oheight == ((iheight * 3 + 7) / 8)) { + } else if (8 * dst_width == 3 * src_width && dst_height == ((src_height * 3 + 7) / 8)) { // optimized, 3/8 - ScalePlaneDown38(iwidth, iheight, owidth, oheight, istride, ostride, - in, out, interpolate); - } else if (4 * owidth == iwidth && 4 * oheight == iheight) { + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src, dst, filtering); + } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { // optimized, 1/4 - ScalePlaneDown4(iwidth, iheight, owidth, oheight, istride, ostride, - in, out, interpolate); - } else if (8 * owidth == iwidth && 8 * oheight == iheight) { + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src, dst, filtering); + } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { // optimized, 1/8 - ScalePlaneDown8(iwidth, iheight, owidth, oheight, istride, ostride, - in, out, interpolate); + ScalePlaneDown8(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src, dst, filtering); } else { // Arbitrary downsample - ScalePlaneDown(iwidth, iheight, owidth, oheight, istride, ostride, - in, out, interpolate); + ScalePlaneDown(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src, dst, filtering); } } else { // Arbitrary scale up and/or down. - ScalePlaneAnySize(iwidth, iheight, owidth, oheight, istride, ostride, - in, out, interpolate); + ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, src_stride, ostride, + src, dst, filtering); } } @@ -2794,59 +2748,91 @@ static void ScalePlane(const uint8 *in, int32 istride, * suitable for handling the desired resolutions. 
* */ -bool Scale(const uint8 *inY, const uint8 *inU, const uint8 *inV, - int32 istrideY, int32 istrideU, int32 istrideV, - int32 iwidth, int32 iheight, - uint8 *outY, uint8 *outU, uint8 *outV, - int32 ostrideY, int32 ostrideU, int32 ostrideV, - int32 owidth, int32 oheight, - bool interpolate) { - if (!inY || !inU || !inV || iwidth <= 0 || iheight <= 0 || - !outY || !outU || !outV || owidth <= 0 || oheight <= 0) { - return false; - } - int32 halfiwidth = (iwidth + 1) >> 1; - int32 halfiheight = (iheight + 1) >> 1; - int32 halfowidth = (owidth + 1) >> 1; - int32 halfoheight = (oheight + 1) >> 1; - ScalePlane(inY, istrideY, iwidth, iheight, - outY, ostrideY, owidth, oheight, - interpolate, use_reference_impl_); - ScalePlane(inU, istrideU, halfiwidth, halfiheight, - outU, ostrideU, halfowidth, halfoheight, - interpolate, use_reference_impl_); - ScalePlane(inV, istrideV, halfiwidth, halfiheight, - outV, ostrideV, halfowidth, halfoheight, - interpolate, use_reference_impl_); - return true; +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height <= 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + int halfiwidth = (src_width + 1) >> 1; + int halfiheight = (src_height + 1) >> 1; + int halfowidth = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, halfiwidth, halfiheight, + dst_u, dst_stride_u, halfowidth, halfoheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, halfiwidth, halfiheight, + 
dst_v, dst_stride_v, halfowidth, halfoheight, + filtering, use_reference_impl_); + return 0; } -bool Scale(const uint8 *in, int32 iwidth, int32 iheight, - uint8 *out, int32 owidth, int32 oheight, int32 ooffset, - bool interpolate) { - if (!in || iwidth <= 0 || iheight <= 0 || - !out || owidth <= 0 || oheight <= 0 || ooffset < 0 || - ooffset >= oheight) { - return false; +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + bool interpolate) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height <= 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + int halfiwidth = (src_width + 1) >> 1; + int halfiheight = (src_height + 1) >> 1; + int halfowidth = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + FilterMode filtering = interpolate ? kFilterBox : kFilterNone; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, halfiwidth, halfiheight, + dst_u, dst_stride_u, halfowidth, halfoheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, halfiwidth, halfiheight, + dst_v, dst_stride_v, halfowidth, halfoheight, + filtering, use_reference_impl_); + return 0; +} + +int Scale(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int ooffset, + bool interpolate) { + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 || + ooffset >= dst_height) { + return -1; } ooffset = ooffset & ~1; // chroma requires offset to multiple of 2. 
- int32 halfiwidth = (iwidth + 1) >> 1; - int32 halfiheight = (iheight + 1) >> 1; - int32 halfowidth = (owidth + 1) >> 1; - int32 halfoheight = (oheight + 1) >> 1; - int32 aheight = oheight - ooffset * 2; // actual output height - const uint8 *const iyptr = in; - uint8 *oyptr = out + ooffset * owidth; - const uint8 *const iuptr = in + iwidth * iheight; - uint8 *ouptr = out + owidth * oheight + (ooffset >> 1) * halfowidth; - const uint8 *const ivptr = in + iwidth * iheight + + int halfiwidth = (src_width + 1) >> 1; + int halfiheight = (src_height + 1) >> 1; + int halfowidth = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + int aheight = dst_height - ooffset * 2; // actual output height + const uint8* const iyptr = src; + uint8* oyptr = dst + ooffset * dst_width; + const uint8* const iuptr = src + src_width * src_height; + uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfowidth; + const uint8* const ivptr = src + src_width * src_height + halfiwidth * halfiheight; - uint8 *ovptr = out + owidth * oheight + halfowidth * halfoheight + + uint8* ovptr = dst + dst_width * dst_height + halfowidth * halfoheight + (ooffset >> 1) * halfowidth; - return Scale(iyptr, iuptr, ivptr, iwidth, halfiwidth, halfiwidth, - iwidth, iheight, oyptr, ouptr, ovptr, owidth, - halfowidth, halfowidth, owidth, aheight, interpolate); + return Scale(iyptr, iuptr, ivptr, src_width, halfiwidth, halfiwidth, + src_width, src_height, oyptr, ouptr, ovptr, dst_width, + halfowidth, halfowidth, dst_width, aheight, interpolate); } } // namespace libyuv diff --git a/source/video_common.cc b/source/video_common.cc index 8242316df..8b8ee622d 100644 --- a/source/video_common.cc +++ b/source/video_common.cc @@ -13,8 +13,6 @@ #include -#include "common.h" - namespace libyuv { #define ARRAY_SIZE(x) (static_cast((sizeof(x)/sizeof(x[0])))) @@ -47,20 +45,4 @@ uint32 CanonicalFourCC(uint32 fourcc) { return fourcc; } -std::string VideoFormat::ToString() const { - std::string 
fourcc_name = GetFourccName(fourcc) + " "; - for (std::string::const_iterator i = fourcc_name.begin(); - i < fourcc_name.end(); ++i) { - // Test character is printable; Avoid isprint() which asserts on negatives - if (*i < 32 || *i >= 127) { - fourcc_name = ""; - break; - } - } - - std::ostringstream ss; - ss << fourcc_name << width << "x" << height << "x" << IntervalToFps(interval); - return ss.str(); -} - } // namespace libyuv diff --git a/source/video_common.h b/source/video_common.h index c936c4cfc..9fe08a03a 100644 --- a/source/video_common.h +++ b/source/video_common.h @@ -18,7 +18,7 @@ #include -#include "basic_types.h" +#include "libyuv/basic_types.h" namespace libyuv { @@ -32,16 +32,6 @@ namespace libyuv { (static_cast(a)) | (static_cast(b) << 8) | \ (static_cast(c) << 16) | (static_cast(d) << 24)) -// Get the name, that is, string with four characters, of a fourcc code. -inline std::string GetFourccName(uint32 fourcc) { - std::string name; - name.push_back(static_cast(fourcc & 0xFF)); - name.push_back(static_cast((fourcc >> 8) & 0xFF)); - name.push_back(static_cast((fourcc >> 16) & 0xFF)); - name.push_back(static_cast((fourcc >> 24) & 0xFF)); - return name; -} - // Some good pages discussing FourCC codes: // http://developer.apple.com/quicktime/icefloe/dispatch020.html // http://www.fourcc.org/yuv.php @@ -87,88 +77,6 @@ enum FourCC { // Converts fourcc aliases into canonical ones. uint32 CanonicalFourCC(uint32 fourcc); -////////////////////////////////////////////////////////////////////////////// -// Definition of VideoFormat. 
-////////////////////////////////////////////////////////////////////////////// - -static const int64 kNumNanosecsPerSec = 1000000000; - -struct VideoFormat { - static const int64 kMinimumInterval = kNumNanosecsPerSec / 10000; // 10k fps - - VideoFormat() : width(0), height(0), interval(0), fourcc(0) {} - - VideoFormat(int w, int h, int64 interval_ns, uint32 cc) - : width(w), - height(h), - interval(interval_ns), - fourcc(cc) { - } - - VideoFormat(const VideoFormat& format) - : width(format.width), - height(format.height), - interval(format.interval), - fourcc(format.fourcc) { - } - - static int64 FpsToInterval(int fps) { - return fps ? kNumNanosecsPerSec / fps : kMinimumInterval; - } - - static int IntervalToFps(int64 interval) { - // Normalize the interval first. - interval = libyuv::_max(interval, kMinimumInterval); - return static_cast(kNumNanosecsPerSec / interval); - } - - bool operator==(const VideoFormat& format) const { - return width == format.width && height == format.height && - interval == format.interval && fourcc == format.fourcc; - } - - bool operator!=(const VideoFormat& format) const { - return !(*this == format); - } - - bool operator<(const VideoFormat& format) const { - return (fourcc < format.fourcc) || - (fourcc == format.fourcc && width < format.width) || - (fourcc == format.fourcc && width == format.width && - height < format.height) || - (fourcc == format.fourcc && width == format.width && - height == format.height && interval > format.interval); - } - - int framerate() const { return IntervalToFps(interval); } - - // Check if both width and height are 0. - bool IsSize0x0() const { return 0 == width && 0 == height; } - - // Check if this format is less than another one by comparing the resolution - // and frame rate. 
- bool IsPixelRateLess(const VideoFormat& format) const { - return width * height * framerate() < - format.width * format.height * format.framerate(); - } - - // Get a string presentation in the form of "fourcc width x height x fps" - std::string ToString() const; - - int width; // in number of pixels - int height; // in number of pixels - int64 interval; // in nanoseconds - uint32 fourcc; // color space. FOURCC_ANY means that any color space is OK. -}; - -// Result of video capturer start. -enum CaptureResult { - CR_SUCCESS, // The capturer starts successfully. - CR_PENDING, // The capturer is pending to start the capture device. - CR_FAILURE, // The capturer fails to start. - CR_NO_DEVICE, // The capturer has no device and fails to start. -}; - } // namespace libyuv #endif // LIBYUV_SOURCE_VIDEO_COMMON_H_ diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc index 73bb6a8a5..1996adf11 100644 --- a/unit_test/unit_test.cc +++ b/unit_test/unit_test.cc @@ -22,8 +22,7 @@ class libyuvEnvironment : public ::testing::Environment { libyuvTest::libyuvTest() : _rotate_max_w(128), - _rotate_max_h(128) -{ + _rotate_max_h(128) { } void libyuvTest::SetUp() {