mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 09:16:48 +08:00
port YToARGB AVX2 to GCC
BUG=393 TESTED=untested R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/39819004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1262 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
f7e5b5e361
commit
baafc97d6b
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1261
|
||||
Version: 1262
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -203,6 +203,7 @@ extern "C" {
|
||||
#define HAS_UYVYTOUV422ROW_AVX2
|
||||
#define HAS_UYVYTOUVROW_AVX2
|
||||
#define HAS_UYVYTOYROW_AVX2
|
||||
#define HAS_YTOARGBROW_AVX2
|
||||
#define HAS_YUY2TOUV422ROW_AVX2
|
||||
#define HAS_YUY2TOUVROW_AVX2
|
||||
#define HAS_YUY2TOYROW_AVX2
|
||||
@ -217,7 +218,6 @@ extern "C" {
|
||||
|
||||
// The following are available but require VS2012. Port to GCC.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
|
||||
#define HAS_YTOARGBROW_AVX2
|
||||
// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393
|
||||
#define HAS_I422TOABGRROW_AVX2
|
||||
#define HAS_I422TOARGBROW_AVX2
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1261
|
||||
#define LIBYUV_VERSION 1262
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -2292,9 +2292,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
|
||||
#endif // HAS_I422TORGBAROW_AVX2
|
||||
|
||||
#ifdef HAS_YTOARGBROW_SSE2
|
||||
void YToARGBRow_SSE2(const uint8* y_buf,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
|
||||
"movd %%eax,%%xmm2 \n"
|
||||
@ -2340,6 +2338,55 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
||||
}
|
||||
#endif // HAS_YTOARGBROW_SSE2
|
||||
|
||||
#ifdef HAS_YTOARGBROW_AVX2
|
||||
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
|
||||
// note: vpunpcklbw mutates and vpackuswb unmutates.
|
||||
void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
|
||||
"vmovd %%eax,%%xmm2 \n"
|
||||
"vbroadcastss %%xmm2,%%ymm2 \n"
|
||||
"mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
|
||||
"vmovd %%eax,%%xmm3 \n"
|
||||
"vbroadcastss %%xmm3,%%ymm3 \n"
|
||||
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpslld $0x18,%%ymm4,%%ymm4 \n"
|
||||
\n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
// Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
|
||||
"vmovdqu (%0),%%xmm0 \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
||||
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
|
||||
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
|
||||
"vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
|
||||
"vpsrlw $0x6,%%ymm0,%%ymm0 \n"
|
||||
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
|
||||
"vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
|
||||
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
|
||||
"vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
|
||||
"vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
|
||||
"vpor %%ymm4,%%ymm0,%%ymm0 \n"
|
||||
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
|
||||
"vmovdqu %%ymm0,(%1) \n"
|
||||
"vmovdqu %%ymm1,0x20(%1) \n"
|
||||
"lea 0x40(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(y_buf), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+rm"(width) // %2
|
||||
:
|
||||
: "memory", "cc", "eax"
|
||||
#if defined(__SSE2__)
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif // HAS_YTOARGBROW_AVX2
|
||||
|
||||
#ifdef HAS_MIRRORROW_SSSE3
|
||||
// Shuffle table for reversing the bytes.
|
||||
static uvec8 kShuffleMirror = {
|
||||
|
||||
@ -2354,14 +2354,14 @@ void YToARGBRow_AVX2(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width) {
|
||||
__asm {
|
||||
vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
|
||||
vpslld ymm4, ymm4, 24
|
||||
mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
|
||||
vmovd xmm3, eax
|
||||
vbroadcastss ymm3, xmm3
|
||||
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
|
||||
vmovd xmm2, eax
|
||||
vbroadcastss ymm2, xmm2
|
||||
mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
|
||||
vmovd xmm3, eax
|
||||
vbroadcastss ymm3, xmm3
|
||||
vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
|
||||
vpslld ymm4, ymm4, 24
|
||||
|
||||
mov eax, [esp + 4] // Y
|
||||
mov edx, [esp + 8] // rgb
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user