mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
Port I420ToARGB to intrinsics for win64
BUG=336 TESTED=out\release_x64\libyuv_unittest --gunit_also_run_disabled_tests --gtest_filter=*I420To*B* R=bryan.bernhart@intel.com, tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/15809005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1018 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
f67b426bdf
commit
e6dd1fa024
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1017
|
Version: 1018
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -152,6 +152,11 @@ extern "C" {
|
|||||||
#define HAS_YUY2TOYROW_SSE2
|
#define HAS_YUY2TOYROW_SSE2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// The following are available on x64 Visual C:
|
||||||
|
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
|
||||||
|
#define HAS_I422TOARGBROW_SSSE3
|
||||||
|
#endif
|
||||||
|
|
||||||
// GCC >= 4.7.0 required for AVX2.
|
// GCC >= 4.7.0 required for AVX2.
|
||||||
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
|
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
|
||||||
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
|
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1017
|
#define LIBYUV_VERSION 1018
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -35,10 +35,12 @@ extern "C" {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||||
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
|
|
||||||
0, 4, 7)
|
|
||||||
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
|
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
|
||||||
1, 4, 7)
|
1, 4, 7)
|
||||||
|
#endif // HAS_I422TOARGBROW_SSSE3
|
||||||
|
#ifdef HAS_I444TOARGBROW_SSSE3
|
||||||
|
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
|
||||||
|
0, 4, 7)
|
||||||
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
|
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
|
||||||
2, 4, 7)
|
2, 4, 7)
|
||||||
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
|
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
|
||||||
@ -59,7 +61,7 @@ YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
|
|||||||
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
|
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
|
||||||
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
|
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
|
||||||
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
|
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
|
||||||
#endif // HAS_I422TOARGBROW_SSSE3
|
#endif // HAS_I444TOARGBROW_SSSE3
|
||||||
#ifdef HAS_I422TOARGBROW_AVX2
|
#ifdef HAS_I422TOARGBROW_AVX2
|
||||||
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
|
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
|
||||||
#endif // HAS_I422TOARGBROW_AVX2
|
#endif // HAS_I422TOARGBROW_AVX2
|
||||||
|
|||||||
@ -10,13 +10,177 @@
|
|||||||
|
|
||||||
#include "libyuv/row.h"
|
#include "libyuv/row.h"
|
||||||
|
|
||||||
|
#if defined (_M_X64)
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#include <tmmintrin.h> // For _mm_maddubs_epi16
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// This module is for Visual C x86.
|
// This module is for Visual C.
|
||||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
|
||||||
|
|
||||||
|
// Fixed-point YUV to RGB coefficients, scaled by 64 (6 fractional bits).
// Every expansion that is not a single non-negative literal is parenthesized
// so the macros stay correct inside larger expressions (e.g. BG * 2).
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */

#define UB 127 /* min(127,(int8)(2.018 * 64)) */
#define UG (-25) /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */

// Bias: the weighted chroma sum for U = V = 128, per output channel.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)
|
||||||
|
|
||||||
|
static const vec8 kUVToB = {
|
||||||
|
UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
|
||||||
|
};
|
||||||
|
|
||||||
|
static const vec8 kUVToR = {
|
||||||
|
UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
|
||||||
|
};
|
||||||
|
|
||||||
|
static const vec8 kUVToG = {
|
||||||
|
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
|
||||||
|
};
|
||||||
|
|
||||||
|
static const vec8 kVUToB = {
|
||||||
|
VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
|
||||||
|
};
|
||||||
|
|
||||||
|
static const vec8 kVUToR = {
|
||||||
|
VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
|
||||||
|
};
|
||||||
|
|
||||||
|
static const vec8 kVUToG = {
|
||||||
|
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
|
||||||
|
};
|
||||||
|
|
||||||
|
static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
|
||||||
|
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
|
||||||
|
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
|
||||||
|
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
|
||||||
|
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
|
||||||
|
|
||||||
|
// 64 bit
|
||||||
|
#if defined(_M_X64)
|
||||||
|
|
||||||
|
// Aligned destination version.
|
||||||
|
__declspec(align(16))
|
||||||
|
void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||||
|
const uint8* u_buf,
|
||||||
|
const uint8* v_buf,
|
||||||
|
uint8* dst_argb,
|
||||||
|
int width) {
|
||||||
|
|
||||||
|
__m128i xmm0, xmm1, xmm2, xmm3;
|
||||||
|
const __m128i xmm5 = _mm_set1_epi8(-1);
|
||||||
|
const __m128i xmm4 = _mm_setzero_si128();
|
||||||
|
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
|
||||||
|
|
||||||
|
while (width > 0) {
|
||||||
|
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
|
||||||
|
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
|
||||||
|
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
||||||
|
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
|
||||||
|
xmm1 = _mm_load_si128(&xmm0);
|
||||||
|
xmm2 = _mm_load_si128(&xmm0);
|
||||||
|
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
|
||||||
|
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
|
||||||
|
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
|
||||||
|
xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
|
||||||
|
xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
|
||||||
|
xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
|
||||||
|
xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
|
||||||
|
xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
|
||||||
|
xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
|
||||||
|
xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
|
||||||
|
xmm0 = _mm_adds_epi16(xmm0, xmm3);
|
||||||
|
xmm1 = _mm_adds_epi16(xmm1, xmm3);
|
||||||
|
xmm2 = _mm_adds_epi16(xmm2, xmm3);
|
||||||
|
xmm0 = _mm_srai_epi16(xmm0, 6);
|
||||||
|
xmm1 = _mm_srai_epi16(xmm1, 6);
|
||||||
|
xmm2 = _mm_srai_epi16(xmm2, 6);
|
||||||
|
xmm0 = _mm_packus_epi16(xmm0, xmm0);
|
||||||
|
xmm1 = _mm_packus_epi16(xmm1, xmm1);
|
||||||
|
xmm2 = _mm_packus_epi16(xmm2, xmm2);
|
||||||
|
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
||||||
|
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
|
||||||
|
xmm1 = _mm_load_si128(&xmm0);
|
||||||
|
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
|
||||||
|
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
|
||||||
|
|
||||||
|
_mm_store_si128((__m128i *)dst_argb, xmm0);
|
||||||
|
_mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
|
||||||
|
|
||||||
|
y_buf += 8;
|
||||||
|
u_buf += 4;
|
||||||
|
dst_argb += 32;
|
||||||
|
width -= 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unaligned destination version.
|
||||||
|
void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||||
|
const uint8* u_buf,
|
||||||
|
const uint8* v_buf,
|
||||||
|
uint8* dst_argb,
|
||||||
|
int width) {
|
||||||
|
|
||||||
|
__m128i xmm0, xmm1, xmm2, xmm3;
|
||||||
|
const __m128i xmm5 = _mm_set1_epi8(-1);
|
||||||
|
const __m128i xmm4 = _mm_setzero_si128();
|
||||||
|
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
|
||||||
|
|
||||||
|
while (width > 0) {
|
||||||
|
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
|
||||||
|
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
|
||||||
|
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
||||||
|
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
|
||||||
|
xmm1 = _mm_load_si128(&xmm0);
|
||||||
|
xmm2 = _mm_load_si128(&xmm0);
|
||||||
|
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
|
||||||
|
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
|
||||||
|
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
|
||||||
|
xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
|
||||||
|
xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
|
||||||
|
xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
|
||||||
|
xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
|
||||||
|
xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
|
||||||
|
xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
|
||||||
|
xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
|
||||||
|
xmm0 = _mm_adds_epi16(xmm0, xmm3);
|
||||||
|
xmm1 = _mm_adds_epi16(xmm1, xmm3);
|
||||||
|
xmm2 = _mm_adds_epi16(xmm2, xmm3);
|
||||||
|
xmm0 = _mm_srai_epi16(xmm0, 6);
|
||||||
|
xmm1 = _mm_srai_epi16(xmm1, 6);
|
||||||
|
xmm2 = _mm_srai_epi16(xmm2, 6);
|
||||||
|
xmm0 = _mm_packus_epi16(xmm0, xmm0);
|
||||||
|
xmm1 = _mm_packus_epi16(xmm1, xmm1);
|
||||||
|
xmm2 = _mm_packus_epi16(xmm2, xmm2);
|
||||||
|
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
||||||
|
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
|
||||||
|
xmm1 = _mm_load_si128(&xmm0);
|
||||||
|
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
|
||||||
|
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
|
||||||
|
|
||||||
|
_mm_storeu_si128((__m128i *)dst_argb, xmm0);
|
||||||
|
_mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
|
||||||
|
|
||||||
|
y_buf += 8;
|
||||||
|
u_buf += 4;
|
||||||
|
dst_argb += 32;
|
||||||
|
width -= 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 32 bit
|
||||||
|
#else // defined(_M_X64)
|
||||||
|
|
||||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||||
|
|
||||||
@ -2030,21 +2194,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
|||||||
}
|
}
|
||||||
#endif // HAS_ARGBTOYROW_SSSE3
|
#endif // HAS_ARGBTOYROW_SSSE3
|
||||||
|
|
||||||
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
|
|
||||||
|
|
||||||
#define UB 127 /* min(63,(int8)(2.018 * 64)) */
|
|
||||||
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
|
|
||||||
#define UR 0
|
|
||||||
|
|
||||||
#define VB 0
|
|
||||||
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
|
|
||||||
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
|
|
||||||
|
|
||||||
// Bias
|
|
||||||
#define BB UB * 128 + VB * 128
|
|
||||||
#define BG UG * 128 + VG * 128
|
|
||||||
#define BR UR * 128 + VR * 128
|
|
||||||
|
|
||||||
#ifdef HAS_I422TOARGBROW_AVX2
|
#ifdef HAS_I422TOARGBROW_AVX2
|
||||||
|
|
||||||
static const lvec8 kUVToB_AVX = {
|
static const lvec8 kUVToB_AVX = {
|
||||||
@ -2150,36 +2299,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
|
|||||||
|
|
||||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||||
|
|
||||||
static const vec8 kUVToB = {
|
|
||||||
UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
|
|
||||||
};
|
|
||||||
|
|
||||||
static const vec8 kUVToR = {
|
|
||||||
UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
|
|
||||||
};
|
|
||||||
|
|
||||||
static const vec8 kUVToG = {
|
|
||||||
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
|
|
||||||
};
|
|
||||||
|
|
||||||
static const vec8 kVUToB = {
|
|
||||||
VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
|
|
||||||
};
|
|
||||||
|
|
||||||
static const vec8 kVUToR = {
|
|
||||||
VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
|
|
||||||
};
|
|
||||||
|
|
||||||
static const vec8 kVUToG = {
|
|
||||||
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
|
|
||||||
};
|
|
||||||
|
|
||||||
static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
|
|
||||||
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
|
|
||||||
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
|
|
||||||
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
|
|
||||||
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
|
|
||||||
|
|
||||||
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
|
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
|
||||||
|
|
||||||
// Read 8 UV from 444.
|
// Read 8 UV from 444.
|
||||||
@ -7276,7 +7395,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
|||||||
}
|
}
|
||||||
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
|
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
|
||||||
|
|
||||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
#endif // defined(_M_X64)
|
||||||
|
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user