Port I400ToARGBRow from SSE2 to AVX2 (adds I400ToARGBRow_AVX2).

BUG=403
TESTED=libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*I400ToARGB*
R=brucedawson@google.com

Review URL: https://webrtc-codereview.appspot.com/46569004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1322 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2015-03-11 18:12:17 +00:00
parent f5a7b2b48a
commit 685b92b0a6
4 changed files with 46 additions and 0 deletions

View File

@ -200,6 +200,7 @@ extern "C" {
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I444TOARGBROW_AVX2
#define HAS_I411TOARGBROW_AVX2
// Feature flag for the AVX2 I400ToARGB row function; code that defines or
// selects that function is guarded by a test of this macro.
#define HAS_I400TOARGBROW_AVX2
// TODO(fbarchard): Port to Neon
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565DITHERROW_AVX2
@ -935,9 +936,11 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
// I400ToARGB row functions: convert a row of gray (Y) samples at src_y into
// ARGB pixels at dst_argb. The suffix names the implementation (SSE2, AVX2,
// NEON, or plain C).
// NOTE(review): pix is presumably the number of pixels in the row, and the
// _Any_ variants presumably accept widths that are not a multiple of the SIMD
// step -- confirm against the definitions.
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I444ToARGBRow_C(const uint8* src_y,

View File

@ -349,6 +349,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_I400TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I400ToARGBRow = I400ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I400ToARGBRow = I400ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I400ToARGBRow = I400ToARGBRow_Any_NEON;

View File

@ -192,6 +192,9 @@ RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C,
// Each RGBANY line instantiates an _Any_ row function from a SIMD row function
// and a C row function.
// NOTE(review): the trailing arguments look like (source bytes per pixel,
// destination bytes per pixel, width mask = SIMD step - 1) -- confirm against
// the RGBANY macro definition.
#if defined(HAS_I400TOARGBROW_SSE2)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7)
#endif
#if defined(HAS_I400TOARGBROW_AVX2)
RGBANY(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, I400ToARGBRow_C, 1, 4, 15)
#endif
#if defined(HAS_YTOARGBROW_SSE2)
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 1, 4, 7)
#endif

View File

@ -284,6 +284,38 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
}
}
#ifdef HAS_I400TOARGBROW_AVX2
// Converts a row of 8-bit gray (Y) samples to 32-bit ARGB pixels.
// Duplicates gray value 3 times and fills in alpha opaque.
//
// src_y:    source row, one Y byte per pixel.
// dst_argb: destination row, 4 bytes per pixel (Y, Y, Y, 0xff in memory).
// pix:      pixel count. Each loop iteration reads 16 Y bytes and stores 64
//           output bytes, and the count is tested only after the stores, so
//           at least 16 pixels are always written and a count that is not a
//           multiple of 16 is effectively rounded up to the next multiple.
//
// Naked function: the compiler emits no prologue/epilogue, so the arguments
// are read directly from the stack relative to esp. Loads and stores are
// unaligned (vmovdqu), so neither pointer needs any particular alignment.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24 // all-ones dwords shifted left by 24 -> alpha byte only
convertloop:
vmovdqu xmm0, [eax] // load 16 Y bytes into the low 128 bits of ymm0
lea eax, [eax + 16] // advance source by 16 pixels
// The unpack instructions below operate independently on each 128-bit lane,
// so vpermq 0xd8 (qword order 0,2,1,3) is used before each unpack stage to
// place the data so that the final results come out in pixel order.
vpermq ymm0, ymm0, 0xd8 // Y0-7 to the low lane, Y8-15 to the high lane
vpunpcklbw ymm0, ymm0, ymm0 // duplicate each byte: YY words, 8 pixels per lane
vpermq ymm0, ymm0, 0xd8 // low lane: pixels 0-3 and 8-11; high lane: 4-7 and 12-15
vpunpckhwd ymm1, ymm0, ymm0 // duplicate words: YYYY dwords for pixels 8-15
vpunpcklwd ymm0, ymm0, ymm0 // duplicate words: YYYY dwords for pixels 0-7
vpor ymm0, ymm0, ymm5 // force the top byte of each pixel to 0xff (opaque)
vpor ymm1, ymm1, ymm5
vmovdqu [edx], ymm0 // store pixels 0-7
vmovdqu [edx + 32], ymm1 // store pixels 8-15
lea edx, [edx + 64] // advance destination by 16 pixels * 4 bytes
sub ecx, 16 // 16 pixels consumed
jg convertloop // repeat while the remaining count is positive
vzeroupper // clear the upper halves of the ymm registers before returning
ret
}
}
#endif // HAS_I400TOARGBROW_AVX2
__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
__asm {