diff --git a/README.chromium b/README.chromium index 5cff5f2f4..577591410 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 353 +Version: 354 License: BSD License File: LICENSE diff --git a/include/libyuv/basic_types.h b/include/libyuv/basic_types.h index 2a8752f30..bd6683616 100644 --- a/include/libyuv/basic_types.h +++ b/include/libyuv/basic_types.h @@ -65,6 +65,10 @@ typedef signed char int8; defined(__i386__) || defined(_M_IX86) #define CPU_X86 1 #endif +// Detect compiler is for arm. +#if defined(__arm__) || defined(_M_ARM) +#define CPU_ARM 1 +#endif #define ALIGNP(p, t) \ (reinterpret_cast(((reinterpret_cast(p) + \ diff --git a/include/libyuv/compare.h b/include/libyuv/compare.h index 9dbb0c55e..4cea1598f 100644 --- a/include/libyuv/compare.h +++ b/include/libyuv/compare.h @@ -21,7 +21,7 @@ extern "C" { // Compute a hash for specified memory. Seed of 5381 recommended. uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); -// Sum Square Error - used to compute Mean Square Error or PSNR +// Sum Square Error - used to compute Mean Square Error or PSNR. uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, int count); diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 96843f989..8eb5eb476 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -12,7 +12,7 @@ #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" -// TODO(fbarchard): Remove the following headers includes +// TODO(fbarchard): Remove the following headers includes. #include "libyuv/convert_from.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" @@ -22,7 +22,7 @@ namespace libyuv { extern "C" { #endif -// Alias +// Alias. #define I420ToI420 I420Copy // Copy I420 to I420. 
@@ -112,56 +112,63 @@ int V210ToI420(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_v, int dst_stride_v, int width, int height); -// ARGB little endian (bgra in memory) to I420 +// ARGB little endian (bgra in memory) to I420. int ARGBToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// BGRA little endian (argb in memory) to I420 +// BGRA little endian (argb in memory) to I420. int BGRAToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// ABGR little endian (rgba in memory) to I420 +// ABGR little endian (rgba in memory) to I420. int ABGRToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// RGB little endian (bgr in memory) to I420 +// RGBA little endian (abgr in memory) to I420. +int RGBAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB little endian (bgr in memory) to I420. int RGB24ToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// RGB big endian (rgb in memory) to I420 +// RGB big endian (rgb in memory) to I420. int RAWToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// RGB16 (RGBP fourcc) little endian to I420 +// RGB16 (RGBP fourcc) little endian to I420. 
int RGB565ToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// RGB15 (RGBO fourcc) little endian to I420 +// RGB15 (RGBO fourcc) little endian to I420. int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// RGB12 (R444 fourcc) little endian to I420 +// RGB12 (R444 fourcc) little endian to I420. int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -169,7 +176,7 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, int width, int height); #ifdef HAVE_JPEG -// src_width/height provided by capture +// src_width/height provided by capture. // dst_width/height for clipping determine final size. int MJPGToI420(const uint8* sample, size_t sample_size, uint8* dst_y, int dst_stride_y, diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index cb41e111c..0c397410c 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -28,7 +28,7 @@ namespace libyuv { extern "C" { #endif -// Alias +// Alias. #define ARGBToARGB ARGBCopy // Copy ARGB to ARGB. @@ -112,17 +112,17 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, // uint8* dst_argb, int dst_stride_argb, // int width, int height); -// BGRA little endian (argb in memory) to ARGB +// BGRA little endian (argb in memory) to ARGB. int BGRAToARGB(const uint8* src_frame, int src_stride_frame, uint8* dst_argb, int dst_stride_argb, int width, int height); -// ABGR little endian (rgba in memory) to ARGB +// ABGR little endian (rgba in memory) to ARGB. 
int ABGRToARGB(const uint8* src_frame, int src_stride_frame, uint8* dst_argb, int dst_stride_argb, int width, int height); -// RGBA little endian (abgr in memory) to ARGB +// RGBA little endian (abgr in memory) to ARGB. int RGBAToARGB(const uint8* src_frame, int src_stride_frame, uint8* dst_argb, int dst_stride_argb, int width, int height); @@ -130,27 +130,27 @@ int RGBAToARGB(const uint8* src_frame, int src_stride_frame, // Deprecated function name. #define BG24ToARGB RGB24ToARGB -// RGB little endian (bgr in memory) to ARGB +// RGB little endian (bgr in memory) to ARGB. int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, uint8* dst_argb, int dst_stride_argb, int width, int height); -// RGB big endian (rgb in memory) to ARGB +// RGB big endian (rgb in memory) to ARGB. int RAWToARGB(const uint8* src_frame, int src_stride_frame, uint8* dst_argb, int dst_stride_argb, int width, int height); -// RGB16 (RGBP fourcc) little endian to ARGB +// RGB16 (RGBP fourcc) little endian to ARGB. int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, uint8* dst_argb, int dst_stride_argb, int width, int height); -// RGB15 (RGBO fourcc) little endian to ARGB +// RGB15 (RGBO fourcc) little endian to ARGB. int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, uint8* dst_argb, int dst_stride_argb, int width, int height); -// RGB12 (R444 fourcc) little endian to ARGB +// RGB12 (R444 fourcc) little endian to ARGB. int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, uint8* dst_argb, int dst_stride_argb, int width, int height); @@ -164,7 +164,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, int dst_width, int dst_height); #endif -// Note Bayer formats (BGGR) to ARGB are in format_conversion.h +// Note Bayer formats (BGGR) to ARGB are in format_conversion.h. // Convert camera sample to ARGB with cropping, rotation and vertical flip. // "src_size" is needed to parse MJPG. 
diff --git a/include/libyuv/convert_from.h b/include/libyuv/convert_from.h index c6eb8947e..841421735 100644 --- a/include/libyuv/convert_from.h +++ b/include/libyuv/convert_from.h @@ -19,9 +19,9 @@ namespace libyuv { extern "C" { #endif -// See Also convert.h for conversions from formats to I420 +// See also convert.h for conversions from formats to I420. -// I420Copy in convert to I420ToI420 +// I420ToI420 is an alias for I420Copy; see convert.h. int I420ToI422(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -47,7 +47,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height); -// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. int I400Copy(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height); @@ -92,6 +92,12 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + int I420ToRGB24(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -122,7 +128,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, uint8* dst_frame, int dst_stride_frame, int width, int height); -// Note Bayer formats (BGGR) To I420 are in format_conversion.h +// Note Bayer formats (BGGR) To I420 are in format_conversion.h. // Convert I420 to specified format. // "dst_sample_stride" is bytes in a row for the destination. 
Pass 0 if the diff --git a/include/libyuv/format_conversion.h b/include/libyuv/format_conversion.h index b91cdd715..1458f9a4b 100644 --- a/include/libyuv/format_conversion.h +++ b/include/libyuv/format_conversion.h @@ -43,7 +43,7 @@ int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer, uint8* dst_v, int dst_stride_v, int width, int height); -// Temporary API mapper +// Temporary API mapper. #define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \ BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f) @@ -79,7 +79,7 @@ int I420ToBayerRGGB(const uint8* src_y, int src_stride_y, uint8* dst_frame, int dst_stride_frame, int width, int height); -// Temporary API mapper +// Temporary API mapper. #define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \ I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f) @@ -107,7 +107,7 @@ int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Temporary API mapper +// Temporary API mapper. #define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f) int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, @@ -132,7 +132,7 @@ int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb, uint8* dst_bayer, int dst_stride_bayer, int width, int height); -// Temporary API mapper +// Temporary API mapper. #define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f) int ARGBToBayer(const uint8* src_argb, int src_stride_argb, diff --git a/include/libyuv/mjpeg_decoder.h b/include/libyuv/mjpeg_decoder.h index c8576e92c..67090cf0b 100644 --- a/include/libyuv/mjpeg_decoder.h +++ b/include/libyuv/mjpeg_decoder.h @@ -13,6 +13,8 @@ #include "libyuv/basic_types.h" +// NOTE: For a simplified public API use convert.h MJPGToI420(). 
+ struct jpeg_common_struct; struct jpeg_decompress_struct; struct jpeg_source_mgr; @@ -85,10 +87,10 @@ class MJpegDecoder { int GetVertSubSampFactor(int component); - // Public for testability + // Public for testability. int GetImageScanlinesPerImcuRow(); - // Public for testability + // Public for testability. int GetComponentScanlinesPerImcuRow(int component); // Width of a component in bytes. diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index e7cd51de7..7d5ce35bc 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -13,7 +13,7 @@ #include "libyuv/basic_types.h" -// TODO(fbarchard): Remove the following headers includes +// TODO(fbarchard): Remove the following headers includes. #include "libyuv/convert.h" #include "libyuv/convert_argb.h" @@ -31,7 +31,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height); -// Convert I420 to I400. (calls CopyPlane ignoring u/v) +// Convert I420 to I400. (calls CopyPlane ignoring u/v). int I420ToI400(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -103,7 +103,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, int width, int height); -// ARGB little endian (bgra in memory) to I422 +// ARGB little endian (bgra in memory) to I422. int ARGBToI422(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, diff --git a/include/libyuv/rotate.h b/include/libyuv/rotate.h index 8766ec3a2..94a7b1746 100644 --- a/include/libyuv/rotate.h +++ b/include/libyuv/rotate.h @@ -31,7 +31,7 @@ enum RotationMode { kRotateCounterClockwise = 270, }; -// Rotate I420 frame +// Rotate I420 frame. 
int I420Rotate(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -40,7 +40,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int src_width, int src_height, RotationMode mode); -// Rotate NV12 input and store in I420 +// Rotate NV12 input and store in I420. int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, uint8* dst_y, int dst_stride_y, diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h index e7e7c3894..b372071ea 100644 --- a/include/libyuv/scale.h +++ b/include/libyuv/scale.h @@ -20,9 +20,9 @@ extern "C" { // Supported filtering enum FilterMode { - kFilterNone = 0, // Point sample; Fastest + kFilterNone = 0, // Point sample; Fastest. kFilterBilinear = 1, // Faster than box, but lower quality scaling down. - kFilterBox = 2 // Highest quality + kFilterBox = 2 // Highest quality. }; // Scale a YUV plane. @@ -52,7 +52,7 @@ int I420Scale(const uint8* src_y, int src_stride_y, int dst_width, int dst_height, FilterMode filtering); -// Legacy API. Deprecated +// Legacy API. Deprecated. int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, int src_stride_y, int src_stride_u, int src_stride_v, int src_width, int src_height, @@ -61,12 +61,12 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, int dst_width, int dst_height, bool interpolate); -// Legacy API. Deprecated +// Legacy API. Deprecated. int ScaleOffset(const uint8* src, int src_width, int src_height, uint8* dst, int dst_width, int dst_height, int dst_yoffset, bool interpolate); -// For testing, allow disabling of optimizations. +// For testing, allow disabling of specialized scalers. 
void SetUseReferenceImpl(bool use); #ifdef __cplusplus diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 04138749b..93bd54156 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 353 +#define LIBYUV_VERSION 354 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/include/libyuv/video_common.h b/include/libyuv/video_common.h index a3f23b27b..b3df74bda 100644 --- a/include/libyuv/video_common.h +++ b/include/libyuv/video_common.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -// Common definitions for video, including fourcc and VideoFormat +// Common definitions for video, including fourcc and VideoFormat. #ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT #define INCLUDE_LIBYUV_VIDEO_COMMON_H_ @@ -107,7 +107,7 @@ enum FourCCBpp { FOURCC_BPP_UYVY = 16, FOURCC_BPP_M420 = 12, FOURCC_BPP_Q420 = 12, - FOURCC_BPP_V210 = 22, // 22.5 actually + FOURCC_BPP_V210 = 22, // 128 / 6 actually. 
FOURCC_BPP_24BG = 24, FOURCC_BPP_ARGB = 32, FOURCC_BPP_BGRA = 32, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index dc9ce47fb..565df1c28 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -598,7 +598,7 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y, #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SETROW_NEON static void SetRow8_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( + asm volatile ( // NOLINT "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop @@ -669,7 +669,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width, #define HAS_SETROW_X86 static void SetRow8_X86(uint8* dst, uint32 v32, int width) { size_t width_tmp = static_cast(width); - asm volatile ( + asm volatile ( // NOLINT "shr $0x2,%1 \n" "rep stosl \n" : "+D"(dst), // %0 @@ -683,7 +683,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width, for (int y = 0; y < height; ++y) { size_t width_tmp = static_cast(width); uint32* d = reinterpret_cast(dst); - asm volatile ( + asm volatile ( // NOLINT "rep stosl \n" : "+D"(d), // %0 "+c"(width_tmp) // %1 @@ -1176,17 +1176,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, return 0; } -#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \ - (defined(__x86_64__) || defined(__i386__))) -#define HAS_SCALEARGBFILTERROWS_SSSE3 -#endif -void ScaleARGBFilterRows_C(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction); -void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction); - // Interpolate 2 ARGB images by specified amount (0 to 255). 
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, @@ -1201,24 +1190,20 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr, + void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = ScaleARGBFilterRows_C; -#if defined(HAS_SCALEARGBFILTERROWS_SSSE3) + int source_y_fraction) = ARGBInterpolateRow_C; +#if defined(HAS_ARGBINTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3; + ARGBInterpolateRow = ARGBInterpolateRow_SSSE3; } #endif - uint8 last16[16]; for (int y = 0; y < height; ++y) { - // Filter extrudes edge for its scaling purpose. - memcpy(last16, dst_argb + width * 4, 16); // Save last 16 beyond end. - ScaleARGBFilterRows(dst_argb, src_argb0, src_argb1 - src_argb0, - width, interpolation); - memcpy(dst_argb + width * 4, last16, 16); // Restore last 16 beyond end. + ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, + width, interpolation); src_argb0 += src_stride_argb0; src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; diff --git a/source/row.h b/source/row.h index 1047e8660..186a6fa88 100644 --- a/source/row.h +++ b/source/row.h @@ -83,9 +83,7 @@ extern "C" { #define HAS_CUMULATIVESUMTOAVERAGE_SSE2 #define HAS_ARGBSHADE_SSE2 #define HAS_ARGBAFFINEROW_SSE2 -// HAS_ARGBBLENDROW_SSE2 may be faster than SSSE3 version on some CPUs, so -// enable it here instead of LIBYUV_SSSE3_ONLY section. 
-#define HAS_ARGBBLENDROW_SSE2 +#define HAS_ARGBINTERPOLATEROW_SSSE3 #endif // The following are Windows only: @@ -102,6 +100,7 @@ extern "C" { !defined(LIBYUV_SSSE3_ONLY) #define HAS_MIRRORROW_SSE2 #define HAS_ARGBATTENUATE_SSE2 +#define HAS_ARGBBLENDROW_SSE2 #endif // The following are available on Neon platforms @@ -553,6 +552,13 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* uv_dudv, int width); +void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); +void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index 83d31a793..e737495e7 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1081,6 +1081,29 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } } +// C version 2x2 -> 2x1. 
+void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + uint8* end = dst_ptr + (dst_width << 2); + do { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; + dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; + dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; + dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; + src_ptr += 8; + src_ptr1 += 8; + dst_ptr += 8; + } while (dst_ptr < end); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_posix.cc b/source/row_posix.cc index 0a9240d89..2da4dfad7 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -3560,6 +3560,71 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 +// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version +void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 2f \n" + "cmp $0x40,%3 \n" + "je 3f \n" + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + "jmp 4f \n" + ".p2align 4 \n" + "2: \n" + "movdqa (%1),%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 2b \n" + "jmp 4f \n" + ".p2align 4 \n" + "3: \n" + "movdqa (%1),%%xmm0 \n" + "pavgb (%1,%4,1),%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 3b \n" + "4: \n" + ".p2align 4 \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"(static_cast(src_stride)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm5" +#endif + ); +} + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index 1afe052a7..b8970bf58 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -3664,6 +3664,81 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 +// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version. 
+__declspec(naked) __declspec(align(16)) +void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 + cmp eax, 0 + je xloop1 + cmp eax, 64 + je xloop2 + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + align 16 + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + + pop edi + pop esi + ret + + align 16 + xloop1: + movdqa xmm0, [esi] + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop1 + + pop edi + pop esi + ret + + align 16 + xloop2: + movdqa xmm0, [esi] + pavgb xmm0, [esi + edx] + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop2 + + pop edi + pop esi + ret + } +} + #endif // _M_IX86 #ifdef __cplusplus diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index f12206022..109b9803f 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -80,7 +80,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \ } #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \ - TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ,) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -) TESTPLANARTOB(I420, 2, 2, ARGB, 4) @@ -151,7 +151,7 @@ TEST_F(libyuvTest, 
FMT_PLANAR##To##FMT_B##N##_OptVsC) { \ } #define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ,) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -) TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4) @@ -233,7 +233,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \ } #define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, ,) \ + TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +) \ TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -) TESTATOPLANAR(ARGB, 4, I420, 2, 2) @@ -293,7 +293,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \ free_aligned_buffer_16(dst_argb_opt) \ } #define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, ,) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, , +) \ TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, Invert, -) TESTATOB(ARGB, 4, 4, ARGB, 4) @@ -853,14 +853,9 @@ TEST_F(libyuvTest, TestShade) { } TEST_F(libyuvTest, TestInterpolate) { - // Interpolate internally used bilinear filtering, which duplicates the last - // value, but the interpolate saves and restores it. The buffer must be - // padded by 16 extra bytes. TODO(fbarchard): Reimplement interpolate with - // code that does not duplicate the last value and remove kPad. 
- const int kPad = 16; SIMD_ALIGNED(uint8 orig_pixels_0[256][4]); SIMD_ALIGNED(uint8 orig_pixels_1[256][4]); - SIMD_ALIGNED(uint8 interpolate_pixels[256 + kPad][4]); + SIMD_ALIGNED(uint8 interpolate_pixels[256][4]); orig_pixels_0[0][0] = 16u; orig_pixels_0[0][1] = 32u; @@ -930,7 +925,7 @@ TEST_F(libyuvTest, TestInterpolate) { EXPECT_EQ(16u, interpolate_pixels[0][2]); EXPECT_EQ(32u, interpolate_pixels[0][3]); - for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + for (int i = 0; i < benchmark_iterations_ * (1280 * 720 / 256); ++i) { ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, &interpolate_pixels[0][0], 0, 256, 1, 128); } diff --git a/unit_test/version_test.cc b/unit_test/version_test.cc index 9eb9c201b..a0a8daf76 100644 --- a/unit_test/version_test.cc +++ b/unit_test/version_test.cc @@ -25,7 +25,9 @@ TEST_F(libyuvTest, TestVersion) { printf("LIBYUV_VERSION %d\n", LIBYUV_VERSION); #ifdef LIBYUV_SVNREVISION const char *ver = strchr(LIBYUV_SVNREVISION, ':'); - if (!ver) { + if (ver) { + ++ver; + } else { ver = LIBYUV_SVNREVISION; } int svn_revision = atoi(ver);