mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Port I420ToARGB to intrinsics for win64
BUG=336 TESTED=out\release_x64\libyuv_unittest --gunit_also_run_disabled_tests --gtest_filter=*I420To*B* R=bryan.bernhart@intel.com, tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/15809005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1018 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
f67b426bdf
commit
e6dd1fa024
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1017
|
||||
Version: 1018
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -152,6 +152,11 @@ extern "C" {
|
||||
#define HAS_YUY2TOYROW_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available on x64 Visual C:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
|
||||
#define HAS_I422TOARGBROW_SSSE3
|
||||
#endif
|
||||
|
||||
// GCC >= 4.7.0 required for AVX2.
|
||||
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
|
||||
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1017
|
||||
#define LIBYUV_VERSION 1018
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -35,10 +35,12 @@ extern "C" {
|
||||
}
|
||||
|
||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
|
||||
0, 4, 7)
|
||||
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
|
||||
1, 4, 7)
|
||||
#endif // HAS_I422TOARGBROW_SSSE3
|
||||
#ifdef HAS_I444TOARGBROW_SSSE3
|
||||
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
|
||||
0, 4, 7)
|
||||
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
|
||||
2, 4, 7)
|
||||
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
|
||||
@ -59,7 +61,7 @@ YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
|
||||
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
|
||||
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
|
||||
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
|
||||
#endif // HAS_I422TOARGBROW_SSSE3
|
||||
#endif // HAS_I444TOARGBROW_SSSE3
|
||||
#ifdef HAS_I422TOARGBROW_AVX2
|
||||
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
|
||||
#endif // HAS_I422TOARGBROW_AVX2
|
||||
|
||||
@ -10,13 +10,177 @@
|
||||
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#if defined (_M_X64)
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h> // For _mm_maddubs_epi16
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for Visual C x86.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||
// This module is for Visual C.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
|
||||
|
||||
// YUV to RGB conversion coefficients in fixed point, scaled by 64 (6
// fractional bits). The row functions shift the sums right by 6 at the end.
// Every expansion is parenthesized so the macros stay correct when used inside
// a larger expression (e.g. 2 * BG or -BG).
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */

#define UB 127 /* min(127,(int8)(2.018 * 64)) */
#define UG (-25) /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */

// Bias: the chroma contribution for U = V = 128. It is subtracted after the
// multiply-add, which has the same effect as centering U and V on 128 first.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)
|
||||
|
||||
static const vec8 kUVToB = {
|
||||
UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
|
||||
};
|
||||
|
||||
static const vec8 kUVToR = {
|
||||
UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
|
||||
};
|
||||
|
||||
static const vec8 kUVToG = {
|
||||
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
|
||||
};
|
||||
|
||||
static const vec8 kVUToB = {
|
||||
VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
|
||||
};
|
||||
|
||||
static const vec8 kVUToR = {
|
||||
VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
|
||||
};
|
||||
|
||||
static const vec8 kVUToG = {
|
||||
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
|
||||
};
|
||||
|
||||
static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
|
||||
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
|
||||
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
|
||||
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
|
||||
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
|
||||
|
||||
// 64 bit
|
||||
#if defined(_M_X64)
|
||||
|
||||
// Aligned destination version.
|
||||
__declspec(align(16))
|
||||
void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
|
||||
__m128i xmm0, xmm1, xmm2, xmm3;
|
||||
const __m128i xmm5 = _mm_set1_epi8(-1);
|
||||
const __m128i xmm4 = _mm_setzero_si128();
|
||||
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
|
||||
|
||||
while (width > 0) {
|
||||
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
|
||||
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
|
||||
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
||||
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
|
||||
xmm1 = _mm_load_si128(&xmm0);
|
||||
xmm2 = _mm_load_si128(&xmm0);
|
||||
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
|
||||
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
|
||||
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
|
||||
xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
|
||||
xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
|
||||
xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
|
||||
xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
|
||||
xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
|
||||
xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
|
||||
xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
|
||||
xmm0 = _mm_adds_epi16(xmm0, xmm3);
|
||||
xmm1 = _mm_adds_epi16(xmm1, xmm3);
|
||||
xmm2 = _mm_adds_epi16(xmm2, xmm3);
|
||||
xmm0 = _mm_srai_epi16(xmm0, 6);
|
||||
xmm1 = _mm_srai_epi16(xmm1, 6);
|
||||
xmm2 = _mm_srai_epi16(xmm2, 6);
|
||||
xmm0 = _mm_packus_epi16(xmm0, xmm0);
|
||||
xmm1 = _mm_packus_epi16(xmm1, xmm1);
|
||||
xmm2 = _mm_packus_epi16(xmm2, xmm2);
|
||||
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
||||
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
|
||||
xmm1 = _mm_load_si128(&xmm0);
|
||||
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
|
||||
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
|
||||
|
||||
_mm_store_si128((__m128i *)dst_argb, xmm0);
|
||||
_mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
|
||||
|
||||
y_buf += 8;
|
||||
u_buf += 4;
|
||||
dst_argb += 32;
|
||||
width -= 8;
|
||||
}
|
||||
}
|
||||
|
||||
// Unaligned destination version.
|
||||
void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
|
||||
__m128i xmm0, xmm1, xmm2, xmm3;
|
||||
const __m128i xmm5 = _mm_set1_epi8(-1);
|
||||
const __m128i xmm4 = _mm_setzero_si128();
|
||||
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
|
||||
|
||||
while (width > 0) {
|
||||
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
|
||||
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
|
||||
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
||||
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
|
||||
xmm1 = _mm_load_si128(&xmm0);
|
||||
xmm2 = _mm_load_si128(&xmm0);
|
||||
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
|
||||
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
|
||||
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
|
||||
xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
|
||||
xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
|
||||
xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
|
||||
xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
|
||||
xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
|
||||
xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
|
||||
xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
|
||||
xmm0 = _mm_adds_epi16(xmm0, xmm3);
|
||||
xmm1 = _mm_adds_epi16(xmm1, xmm3);
|
||||
xmm2 = _mm_adds_epi16(xmm2, xmm3);
|
||||
xmm0 = _mm_srai_epi16(xmm0, 6);
|
||||
xmm1 = _mm_srai_epi16(xmm1, 6);
|
||||
xmm2 = _mm_srai_epi16(xmm2, 6);
|
||||
xmm0 = _mm_packus_epi16(xmm0, xmm0);
|
||||
xmm1 = _mm_packus_epi16(xmm1, xmm1);
|
||||
xmm2 = _mm_packus_epi16(xmm2, xmm2);
|
||||
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
||||
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
|
||||
xmm1 = _mm_load_si128(&xmm0);
|
||||
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
|
||||
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
|
||||
|
||||
_mm_storeu_si128((__m128i *)dst_argb, xmm0);
|
||||
_mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
|
||||
|
||||
y_buf += 8;
|
||||
u_buf += 4;
|
||||
dst_argb += 32;
|
||||
width -= 8;
|
||||
}
|
||||
}
|
||||
// 32 bit
|
||||
#else // defined(_M_X64)
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
|
||||
@ -2030,21 +2194,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
}
|
||||
#endif // HAS_ARGBTOYROW_SSSE3
|
||||
|
||||
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
|
||||
|
||||
#define UB 127 /* min(127,(int8)(2.018 * 64)) */
|
||||
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
|
||||
#define UR 0
|
||||
|
||||
#define VB 0
|
||||
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
|
||||
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
|
||||
|
||||
// Bias
|
||||
#define BB UB * 128 + VB * 128
|
||||
#define BG UG * 128 + VG * 128
|
||||
#define BR UR * 128 + VR * 128
|
||||
|
||||
#ifdef HAS_I422TOARGBROW_AVX2
|
||||
|
||||
static const lvec8 kUVToB_AVX = {
|
||||
@ -2150,36 +2299,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
|
||||
|
||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||
|
||||
static const vec8 kUVToB = {
|
||||
UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
|
||||
};
|
||||
|
||||
static const vec8 kUVToR = {
|
||||
UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
|
||||
};
|
||||
|
||||
static const vec8 kUVToG = {
|
||||
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
|
||||
};
|
||||
|
||||
static const vec8 kVUToB = {
|
||||
VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
|
||||
};
|
||||
|
||||
static const vec8 kVUToR = {
|
||||
VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
|
||||
};
|
||||
|
||||
static const vec8 kVUToG = {
|
||||
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
|
||||
};
|
||||
|
||||
static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
|
||||
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
|
||||
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
|
||||
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
|
||||
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
|
||||
|
||||
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
|
||||
|
||||
// Read 8 UV from 444.
|
||||
@ -7276,7 +7395,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
}
|
||||
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||
#endif // defined(_M_X64)
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user