diff --git a/README.chromium b/README.chromium
index ecbcffd85..297e501ae 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1215
+Version: 1216
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index cc4344f41..7a58615a5 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -96,8 +96,7 @@ extern "C" {
 #define HAS_I422TOUYVYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I444TOARGBROW_SSSE3
-// TODO(fbarchard): Implement SSSE3 version of J422ToARGB
-//#define HAS_J422TOARGBROW_SSSE3
+// #define HAS_J422TOARGBROW_SSSE3
 #define HAS_MERGEUVROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #define HAS_MIRRORROW_SSSE3
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 906f62772..eb65e8fa2 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1215
+#define LIBYUV_VERSION 1216
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row_any.cc b/source/row_any.cc
index 253436993..bf5455a51 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -65,7 +65,7 @@ YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
 
 #ifdef HAS_J422TOARGBROW_SSSE3
 YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C, 1, 4, 7)
-#endif  // HAS_I422TOARGBROW_SSSE3
+#endif  // HAS_J422TOARGBROW_SSSE3
 #ifdef HAS_I422TOARGBROW_AVX2
 YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
 #endif  // HAS_I422TOARGBROW_AVX2
diff --git a/source/row_posix.cc b/source/row_posix.cc
index be1a7c163..be070188a 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -881,11 +881,6 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 #endif  // HAS_ARGBTOYJROW_AVX2
 
 #ifdef HAS_ARGBTOUVROW_SSSE3
-// TODO(fbarchard): pass xmm constants to single block of assembly.
-// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
-// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
-// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
-// and considered unsafe.
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
@@ -1523,20 +1518,20 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
 }
 
 #ifdef HAS_I422TOARGBROW_SSSE3
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define YG 74 /* (int8)round(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(63,(int8)round(2.018 * 64)) */
+#define UG -25 /* (int8)round(-0.391 * 64 - 0.5) */
 #define UR 0
 
 #define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+#define VG -52 /* (int8)round(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)round(1.596 * 64 + 0.5) */
 
 // Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+#define BB (UB * 128 + VB * 128 + YG * 16)
+#define BG (UG * 128 + VG * 128 + YG * 16)
+#define BR (UR * 128 + VR * 128 + YG * 16)
 
 struct {
   vec8 kUVToB;  // 0
@@ -1545,11 +1540,10 @@ struct {
   vec16 kUVBiasB;  // 48
   vec16 kUVBiasG;  // 64
   vec16 kUVBiasR;  // 80
-  vec16 kYSub16;  // 96
-  vec16 kYToRgb;  // 112
-  vec8 kVUToB;  // 128
-  vec8 kVUToG;  // 144
-  vec8 kVUToR;  // 160
+  vec16 kYToRgb;  // 96
+  vec8 kVUToB;  // 112
+  vec8 kVUToG;  // 128
+  vec8 kVUToR;  // 144
 } static SIMD_ALIGNED(kYuvConstants) = {
   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
@@ -1557,7 +1551,6 @@ struct {
   { BB, BB, BB, BB, BB, BB, BB, BB },
   { BG, BG, BG, BG, BG, BG, BG, BG },
   { BR, BR, BR, BR, BR, BR, BR, BR },
-  { 16, 16, 16, 16, 16, 16, 16, 16 },
   { YG, YG, YG, YG, YG, YG, YG, YG },
   { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
   { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
@@ -1607,8 +1600,7 @@ struct {
     "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
     "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
     "punpcklbw %%xmm4,%%xmm3 \n" \
-    "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
-    "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
+    "pmullw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
     "paddsw %%xmm3,%%xmm0 \n" \
     "paddsw %%xmm3,%%xmm1 \n" \
     "paddsw %%xmm3,%%xmm2 \n" \
@@ -1623,17 +1615,16 @@ struct {
 #define YVUTORGB \
     "movdqa %%xmm0,%%xmm1 \n" \
     "movdqa %%xmm0,%%xmm2 \n" \
-    "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \
-    "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \
-    "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \
+    "pmaddubsw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm0 \n" \
+    "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm1 \n" \
+    "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm2 \n" \
     "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
     "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
     "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
     "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
     "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
     "punpcklbw %%xmm4,%%xmm3 \n" \
-    "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
-    "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
+    "pmullw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
     "paddsw %%xmm3,%%xmm0 \n" \
     "paddsw %%xmm3,%%xmm1 \n" \
     "paddsw %%xmm3,%%xmm2 \n" \
@@ -1767,7 +1758,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
     [dst_raw]"+r"(dst_raw),  // %[dst_raw]
 // TODO(fbarchard): Make width a register for 32 bit.
 #if defined(__APPLE__) && defined(__i386__)
-    [width]"+m"(width)  // %[width]
+    [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
 #endif
@@ -2059,8 +2050,7 @@ struct {
   lvec16 kUVBiasB_AVX;  // 96
   lvec16 kUVBiasG_AVX;  // 128
   lvec16 kUVBiasR_AVX;  // 160
-  lvec16 kYSub16_AVX;  // 192
-  lvec16 kYToRgb_AVX;  // 224
+  lvec16 kYToRgb_AVX;  // 192
 } static SIMD_ALIGNED(kYuvConstants_AVX) = {
   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
     UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
@@ -2074,8 +2064,6 @@ struct {
     BG, BG, BG, BG, BG, BG, BG, BG },
   { BR, BR, BR, BR, BR, BR, BR, BR,
     BR, BR, BR, BR, BR, BR, BR, BR },
-  { 16, 16, 16, 16, 16, 16, 16, 16,
-    16, 16, 16, 16, 16, 16, 16, 16 },
   { YG, YG, YG, YG, YG, YG, YG, YG,
     YG, YG, YG, YG, YG, YG, YG, YG }
 };
@@ -2102,8 +2090,7 @@ struct {
     "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
     "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
     "vpunpcklbw %%ymm4,%%ymm3,%%ymm3 \n" \
-    "vpsubsw " MEMACCESS2(192, [kYuvConstants]) ",%%ymm3,%%ymm3 \n" \
-    "vpmullw " MEMACCESS2(224, [kYuvConstants]) ",%%ymm3,%%ymm3 \n" \
+    "vpmullw " MEMACCESS2(192, [kYuvConstants]) ",%%ymm3,%%ymm3 \n" \
     "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
     "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
     "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
diff --git a/source/row_win.cc b/source/row_win.cc
index 31af591c9..b861b2e9b 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -24,20 +24,20 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
     (defined(_M_IX86) || defined(_M_X64))
 
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+#define YG 74 /* (int8)round(1.164 * 64 + 0.5) */
 
-#define UB 127 /* min(127,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UB 127 /* min(63,(int8)round(2.018 * 64)) */
+#define UG -25 /* (int8)round(-0.391 * 64 - 0.5) */
 #define UR 0
 
 #define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+#define VG -52 /* (int8)round(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)round(1.596 * 64 + 0.5) */
 
 // Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
+#define BB (UB * 128 + VB * 128 + YG * 16)
+#define BG (UG * 128 + VG * 128 + YG * 16)
+#define BR (UR * 128 + VR * 128 + YG * 16)
 
 static const vec8 kUVToB = {
   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
@@ -64,7 +64,6 @@ static const vec8 kVUToG = {
 };
 
 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
@@ -98,7 +97,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
     xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
     xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
     xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
-    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
     xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
     xmm0 = _mm_adds_epi16(xmm0, xmm3);
     xmm1 = _mm_adds_epi16(xmm1, xmm3);
@@ -1489,9 +1487,6 @@ static const lvec8 kUVToG_AVX = {
 static const lvec16 kYToRgb_AVX = {
   YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
 };
-static const lvec16 kYSub16_AVX = {
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-};
 static const lvec16 kUVBiasB_AVX = {
   BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
 };
@@ -1527,7 +1522,6 @@ static const lvec16 kUVBiasR_AVX = {
     __asm lea eax, [eax + 16] \
     __asm vpermq ymm3, ymm3, 0xd8 \
     __asm vpunpcklbw ymm3, ymm3, ymm4 \
-    __asm vpsubsw ymm3, ymm3, kYSub16_AVX \
     __asm vpmullw ymm3, ymm3, kYToRgb_AVX \
     __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
     __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
@@ -1727,7 +1721,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_I422TOABGRROW_AVX2
 
-#ifdef HAS_I422TOARGBROW_SSSE3
+#if defined(HAS_I422TOARGBROW_SSSE3)
 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
 
 // Read 8 UV from 444.
@@ -1781,7 +1775,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
     __asm lea eax, [eax + 8] \
     __asm punpcklbw xmm3, xmm4 \
-    __asm psubsw xmm3, kYSub16 \
     __asm pmullw xmm3, kYToRgb \
     __asm paddsw xmm0, xmm3 /* B += Y */ \
     __asm paddsw xmm1, xmm3 /* G += Y */ \
@@ -1809,7 +1802,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
     __asm lea eax, [eax + 8] \
     __asm punpcklbw xmm3, xmm4 \
-    __asm psubsw xmm3, kYSub16 \
     __asm pmullw xmm3, kYToRgb \
     __asm paddsw xmm0, xmm3 /* B += Y */ \
     __asm paddsw xmm1, xmm3 /* G += Y */ \
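
Note (not part of the patch): aside from the version bump and the corrected #endif comment in
row_any.cc, the change above folds the "subtract 16 from Y" step into the row bias constants.
BB/BG/BR now carry a YG * 16 term, so the kYSub16 / kYSub16_AVX constants and the
psubsw / vpsubsw / _mm_subs_epi16 instructions can be dropped, and the constant-table offsets
after kUVBiasR shift down by one vector. The standalone C sketch below (constants copied from
the patch; the program itself is illustrative only and not libyuv code) checks the integer
identity behind that folding for the B channel; G and R work the same way with their own
coefficients, so the per-pixel results are unchanged while one instruction per block and one
constant table entry are removed.

  #include <assert.h>
  #include <stdio.h>

  #define YG 74   /* Y scale, from the patch */
  #define UB 127  /* U contribution to B, from the patch */
  #define VB 0    /* V contribution to B, from the patch */

  int main(void) {
    for (int y = 0; y < 256; ++y) {
      for (int u = 0; u < 256; ++u) {
        for (int v = 0; v < 256; ++v) {
          /* Old formulation: subtract 16 from Y first; bias excludes YG. */
          int old_b = (y - 16) * YG + u * UB + v * VB - (UB * 128 + VB * 128);
          /* New formulation: scale Y directly; YG * 16 is folded into the bias. */
          int new_b = y * YG + u * UB + v * VB - (UB * 128 + VB * 128 + YG * 16);
          assert(old_b == new_b);
        }
      }
    }
    printf("bias folding is exact for all 8-bit Y/U/V inputs\n");
    return 0;
  }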