mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
ARGB1555ToARGBRow_SSE2
BUG=none TEST=media_unittest Review URL: http://webrtc-codereview.appspot.com/349006 git-svn-id: http://libyuv.googlecode.com/svn/trunk@133 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
6aa761da6d
commit
ccd6d9b2de
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 132
|
||||
Version: 133
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
80
source/row.h
80
source/row.h
@ -60,8 +60,9 @@
|
||||
|
||||
// The following are available on Windows platforms
|
||||
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
|
||||
#define HAS_ARGB4444TOARGBROW_SSE2
|
||||
#define HAS_RGB565TOARGBROW_SSE2
|
||||
#define HAS_ARGB1555TOARGBROW_SSE2
|
||||
#define HAS_ARGB4444TOARGBROW_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms
|
||||
@ -82,64 +83,60 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef HAS_FASTCONVERTYUVTOARGBROW_NEON
|
||||
#if defined(_MSC_VER)
|
||||
#define SIMD_ALIGNED(var) __declspec(align(16)) var
|
||||
typedef __declspec(align(16)) signed char vec8[16];
|
||||
typedef __declspec(align(16)) unsigned char uvec8[16];
|
||||
typedef __declspec(align(16)) signed short vec16[8];
|
||||
#else // __GNUC__
|
||||
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
|
||||
typedef signed char __attribute__((vector_size(16))) vec8;
|
||||
typedef unsigned char __attribute__((vector_size(16))) uvec8;
|
||||
typedef signed short __attribute__((vector_size(16))) vec16;
|
||||
#endif
|
||||
|
||||
|
||||
void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
#ifdef HAS_FASTCONVERTYUVTOBGRAROW_NEON
|
||||
void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
#ifdef HAS_FASTCONVERTYUVTOABGRROW_NEON
|
||||
void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
#ifdef HAS_FASTCONVERTYUVTORGB565ROW_NEON
|
||||
void FastConvertYUVToRGB565Row_NEON(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
#ifdef HAS_FASTCONVERTYUVTOARGB1555ROW_NEON
|
||||
void FastConvertYUVToARGB1555Row_NEON(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
#ifdef HAS_FASTCONVERTYUVTOARGB4444ROW_NEON
|
||||
void FastConvertYUVToARGB4444Row_NEON(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
#ifdef HAS_FASTCONVERTYUVTORGB24ROW_NEON
|
||||
void FastConvertYUVToRGB24Row_NEON(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
#ifdef HAS_FASTCONVERTYUVTORAWROW_NEON
|
||||
void FastConvertYUVToRAWRow_NEON(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
@ -149,11 +146,6 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
#endif
|
||||
#if defined(HAS_RGB24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
|
||||
#define HASRGB24TOYROW_SSSE3
|
||||
#endif
|
||||
#ifdef HASRGB24TOYROW_SSSE3
|
||||
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void RGB565ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
@ -171,16 +163,9 @@ void ARGB1555ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
void ARGB4444ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
|
||||
#endif
|
||||
#ifdef HAS_REVERSE_ROW_SSSE3
|
||||
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
|
||||
#endif
|
||||
#ifdef HAS_REVERSE_ROW_SSE2
|
||||
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);
|
||||
#endif
|
||||
#ifdef HAS_REVERSE_ROW_NEON
|
||||
void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
|
||||
#endif
|
||||
void ReverseRow_C(const uint8* src, uint8* dst, int width);
|
||||
|
||||
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
@ -209,20 +194,14 @@ void ARGB1555ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
void ARGB4444ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
|
||||
#ifdef HAS_RGB24TOARGBROW_SSSE3
|
||||
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
|
||||
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
|
||||
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
|
||||
void RAWToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
|
||||
// TODO(fbarchard): SSE2 555
|
||||
//void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
|
||||
#endif
|
||||
#ifdef HAS_RGB565TOARGBROW_SSE2
|
||||
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
|
||||
void RGB565ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
|
||||
#endif
|
||||
#ifdef HAS_ARGB4444TOARGBROW_SSE2
|
||||
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
|
||||
#endif
|
||||
|
||||
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
|
||||
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
|
||||
void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
|
||||
@ -231,27 +210,9 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
|
||||
void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
|
||||
void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
|
||||
|
||||
#ifdef HAS_I400TOARGBROW_SSE2
|
||||
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
#endif
|
||||
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define SIMD_ALIGNED(var) __declspec(align(16)) var
|
||||
typedef __declspec(align(16)) signed char vec8[16];
|
||||
typedef __declspec(align(16)) unsigned char uvec8[16];
|
||||
typedef __declspec(align(16)) signed short vec16[8];
|
||||
#else // __GNUC__
|
||||
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
|
||||
typedef signed char __attribute__((vector_size(16))) vec8;
|
||||
typedef unsigned char __attribute__((vector_size(16))) uvec8;
|
||||
typedef signed short __attribute__((vector_size(16))) vec16;
|
||||
#endif
|
||||
|
||||
extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
|
||||
extern "C" SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
|
||||
extern "C" SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
|
||||
|
||||
void FastConvertYUVToARGBRow_C(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
@ -310,7 +271,6 @@ void FastConvertYToARGBRow_C(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
|
||||
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
|
||||
void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
@ -344,9 +304,7 @@ void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
|
||||
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
|
||||
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
|
||||
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
@ -400,15 +358,11 @@ void FastConvertYUVToRAWRow_SSSE3(const uint8* y_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
|
||||
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
|
||||
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -328,7 +328,11 @@ void RGB565ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
|
||||
void ARGB1555ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride]);
|
||||
#ifdef HAS_ARGB1555TOARGBROW_SSE2
|
||||
ARGB1555ToARGBRow_SSE2(src_argb, row, pix);
|
||||
#else
|
||||
ARGB1555ToARGBRow_C(src_argb, row, pix);
|
||||
#endif
|
||||
ARGBToYRow_SSSE3(row, dst_y, pix);
|
||||
}
|
||||
|
||||
@ -378,8 +382,13 @@ void RGB565ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
|
||||
void ARGB1555ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
#ifdef HAS_ARGB1555TOARGBROW_SSE2
|
||||
ARGB1555ToARGBRow_SSE2(src_argb, row, pix);
|
||||
ARGB1555ToARGBRow_SSE2(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
#else
|
||||
ARGB1555ToARGBRow_C(src_argb, row, pix);
|
||||
ARGB1555ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
#endif
|
||||
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
|
||||
|
||||
@ -229,53 +229,50 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SHIFT565
|
||||
// Below shift/mask code is efficient and works, but more instructions than
|
||||
// pmul method
|
||||
// TODO(fbarchard): Port RGB565ToARGBRow_SSE2 to gcc
|
||||
// 29 instructions
|
||||
__declspec(naked)
|
||||
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
|
||||
int pix) {
|
||||
void OldRGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
|
||||
int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_rgb565
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // pix
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0xff000000 for Alpha
|
||||
pslld xmm5, 24
|
||||
pcmpeqb xmm4, xmm4 // generate mask 0xf800f800
|
||||
pcmpeqb xmm4, xmm4 // generate mask 0xf800f800 for Red
|
||||
psllw xmm4, 11
|
||||
pcmpeqb xmm6, xmm6 // generate mask 0x001f001f
|
||||
pcmpeqb xmm6, xmm6 // generate mask 0x001f001f for Blue
|
||||
psrlw xmm6, 11
|
||||
pcmpeqb xmm7, xmm7 // generate mask 0x00fc00fc
|
||||
pcmpeqb xmm7, xmm7 // generate mask 0x00fc00fc for Green
|
||||
psrlw xmm7, 10
|
||||
psllw xmm7, 2
|
||||
|
||||
|
||||
convertloop:
|
||||
movdqa xmm0, [eax] // fetch 8 pixels of bgr565
|
||||
lea eax, [eax + 16]
|
||||
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm2, xmm0
|
||||
pand xmm1, xmm4 // R in upper 5 bits
|
||||
psrlw xmm2, 13 // R 3 bits
|
||||
psllw xmm2, 8
|
||||
por xmm1, xmm2
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
pand xmm2, xmm6 // mask B 5 bits
|
||||
movdqa xmm3, xmm2
|
||||
psllw xmm2, 3
|
||||
psrlw xmm3, 2
|
||||
por xmm2, xmm3
|
||||
|
||||
por xmm1, xmm2 // RB
|
||||
|
||||
psrlw xmm0, 3 // G in top 6 bits of lower byte
|
||||
pand xmm0, xmm7 // mask G 6 bits
|
||||
movdqa xmm2, xmm0
|
||||
psrlw xmm2, 6
|
||||
por xmm0, xmm2
|
||||
|
||||
por xmm0, xmm5 // AG
|
||||
|
||||
movdqa xmm2, xmm1
|
||||
punpcklbw xmm1, xmm0
|
||||
punpckhbw xmm2, xmm0
|
||||
@ -288,7 +285,177 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc
|
||||
// 33 instructions
|
||||
// Shift/mask ARGB1555 -> ARGB row conversion (MSVC x86 inline asm, SSE2).
//   src_argb1555: source row, 2 bytes per pixel; loaded with movdqa, so it
//                 must be 16-byte aligned.
//   dst_argb:     destination row, 4 bytes per pixel; stored with movdqa, so
//                 it must be 16-byte aligned.
//   pix:          pixel count. Eight pixels are converted per iteration and
//                 the loop repeats while the remaining count stays above zero.
// Each 16-bit source word is A(1) R(5) G(5) B(5). Every 5-bit channel is
// widened to 8 bits as (v << 3) | (v >> 2); alpha is expanded to 0x00 or 0xff.
__declspec(naked)
void OldARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                               int pix) {
__asm {
    mov        eax, [esp + 4]   // src_argb1555
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00 for Alpha
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xf800f800 for Red
    psllw      xmm4, 11
    pcmpeqb    xmm6, xmm6       // generate mask 0x001f001f for Blue
    psrlw      xmm6, 11
    pcmpeqb    xmm7, xmm7       // generate mask 0x00f800f8 for Green
    psrlw      xmm7, 11
    psllw      xmm7, 3

  convertloop:
    movdqa     xmm0, [eax]      // fetch 8 pixels of argb1555
    lea        eax, [eax + 16]

    // Red: goes to the high byte of each RB word.
    movdqa     xmm1, xmm0
    psllw      xmm1, 1          // shift out alpha; R now in bits 15..11
    movdqa     xmm2, xmm1       // copy AFTER the shift so that the top three
                                // bits are R4:R3:R2 rather than A:R4:R3
    pand       xmm1, xmm4       // R in upper 5 bits
    psrlw      xmm2, 13         // R top 3 bits
    psllw      xmm2, 8          // move them just below the 5-bit field
    por        xmm1, xmm2

    // Blue: goes to the low byte of each RB word.
    movdqa     xmm2, xmm0
    pand       xmm2, xmm6       // mask B 5 bits
    movdqa     xmm3, xmm2
    psllw      xmm2, 3
    psrlw      xmm3, 2
    por        xmm2, xmm3
    por        xmm1, xmm2       // RB

    // Green: goes to the low byte of each AG word.
    movdqa     xmm2, xmm0
    psrlw      xmm2, 2          // G in top 5 bits of lower byte
    pand       xmm2, xmm7       // mask G 5 bits
    movdqa     xmm3, xmm2
    psrlw      xmm3, 5
    por        xmm2, xmm3

    // Alpha: arithmetic shift smears bit 15 across the high byte.
    psraw      xmm0, 8          // A
    pand       xmm0, xmm5
    por        xmm0, xmm2       // AG

    // Interleave RB and AG bytes into B, G, R, A byte order.
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [edx], xmm1      // store 4 pixels of ARGB
    movdqa     [edx + 16], xmm2 // store next 4 pixels of ARGB
    lea        edx, [edx + 32]
    sub        ecx, 8
    ja         convertloop
    ret
  }
}
|
||||
#endif
|
||||
|
||||
// pmul method to replicate bits
|
||||
// Math to replicate bits
|
||||
// (v << 8) | (v << 3)
|
||||
// v * 256 + v * 8
|
||||
// v * (256 + 8)
|
||||
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 2 (G is 6 bits)
|
||||
// 20 instructions
|
||||
// RGB565 -> ARGB row conversion using pmulhuw to replicate bits
// (MSVC x86 inline asm, SSE2).
//   src_rgb565: source row, 2 bytes per pixel; loaded with movdqa, so it must
//               be 16-byte aligned.
//   dst_argb:   destination row, 4 bytes per pixel; stored with movdqa, so it
//               must be 16-byte aligned.
//   pix:        pixel count. Eight pixels are converted per iteration and the
//               loop repeats while the remaining count stays above zero.
// Alpha is always written as 0xff.
__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
__asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    // 6-bit green sits at bits 10..5, so the multiplier is (256 + 4) << 5 =
    // 0x2080 per lane. The previous value 0x2008 produced only G << 2 and left
    // the two low bits of the 8-bit green at zero (max 252 instead of 255).
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    pcmpeqb    xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw      xmm4, 10
    psrlw      xmm4, 5
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_rgb565
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    // edx = dst - 2 * src, so [eax * 2 + edx] tracks the destination while
    // only eax is advanced (4 output bytes per 2 input bytes).
    sub        edx, eax
    sub        edx, eax

  convertloop:
    movdqa     xmm0, [eax]      // fetch 8 pixels of bgr565
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    pand       xmm1, xmm3       // R in upper 5 bits
    psllw      xmm2, 11         // B in upper 5 bits
    pmulhuw    xmm1, xmm5       // * (256 + 8)
    pmulhuw    xmm2, xmm5       // * (256 + 8)
    psllw      xmm1, 8          // R to the high byte of each word
    por        xmm1, xmm2       // RB
    pand       xmm0, xmm4       // G in middle 6 bits
    pmulhuw    xmm0, xmm6       // << 5 * (256 + 4)
    por        xmm0, xmm7       // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0       // interleave to B, G, R, A byte order
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    ja         convertloop
    ret
  }
}
|
||||
|
||||
// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc
|
||||
// 24 instructions
|
||||
// ARGB1555 -> ARGB row conversion using pmulhuw to replicate bits
// (MSVC x86 inline asm, SSE2).
//   src_argb1555: source row, 2 bytes per pixel, read with movdqa.
//   dst_argb:     destination row, 4 bytes per pixel, written with movdqa.
//   pix:          pixel count; 8 pixels are handled per loop iteration.
// Each source word is A(1) R(5) G(5) B(5).
__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
__asm {
    // Channel masks.
    pcmpeqb    xmm3, xmm3       // 0xf800f800: Red once shifted left by 1
    psllw      xmm3, 11
    pcmpeqb    xmm4, xmm4       // 0x03e003e0: Green
    psllw      xmm4, 11
    psrlw      xmm4, 6
    pcmpeqb    xmm7, xmm7       // 0xff00ff00: Alpha byte of each AG word
    psllw      xmm7, 8

    // Bit-replication multipliers, broadcast to every lane.
    mov        eax, 0x01080108  // (256 + 8): repeat 5 bits held in bits 15..11
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x42004200  // (256 + 8) << 6: repeat 5 bits held in 9..5
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0

    // Arguments. edx becomes dst - 2 * src so a single index (eax) addresses
    // both the source and, scaled by two, the destination.
    mov        eax, [esp + 4]   // src_argb1555
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

  convert8:
    movdqa     xmm0, [eax]      // load 8 pixels of 1555

    // Red -> high byte of RB.
    movdqa     xmm1, xmm0
    psllw      xmm1, 1          // R in upper 5 bits
    pand       xmm1, xmm3
    pmulhuw    xmm1, xmm5       // * (256 + 8)
    psllw      xmm1, 8

    // Blue -> low byte of RB.
    movdqa     xmm2, xmm0
    psllw      xmm2, 11         // B in upper 5 bits
    pmulhuw    xmm2, xmm5       // * (256 + 8)
    por        xmm1, xmm2       // RB

    // Alpha -> high byte of AG (sign bit smeared, then masked).
    movdqa     xmm2, xmm0
    psraw      xmm2, 8
    pand       xmm2, xmm7

    // Green -> low byte of AG.
    pand       xmm0, xmm4       // G in middle 5 bits
    pmulhuw    xmm0, xmm6       // << 6 * (256 + 8)
    por        xmm0, xmm2       // AG

    // Interleave RB and AG bytes into B, G, R, A order and store.
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [edx + eax * 2], xmm1       // first 4 ARGB pixels
    movdqa     [edx + eax * 2 + 16], xmm2  // next 4 ARGB pixels
    lea        eax, [eax + 16]
    sub        ecx, 8
    ja         convert8
    ret
  }
}
|
||||
|
||||
// TODO(fbarchard): Port ARGB4444ToARGBRow_SSE2 to gcc
|
||||
// 18 instructions
|
||||
__declspec(naked)
|
||||
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
|
||||
int pix) {
|
||||
@ -301,10 +468,11 @@ __asm {
|
||||
mov eax, [esp + 4] // src_argb4444
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // pix
|
||||
sub edx, eax
|
||||
sub edx, eax
|
||||
|
||||
convertloop:
|
||||
movdqa xmm0, qword ptr [eax] // fetch 8 pixels of bgra4444
|
||||
lea eax, [eax + 16]
|
||||
movdqa xmm0, [eax] // fetch 8 pixels of bgra4444
|
||||
movdqa xmm2, xmm0
|
||||
pand xmm0, xmm4 // mask low nibbles
|
||||
pand xmm2, xmm5 // mask high nibbles
|
||||
@ -317,9 +485,9 @@ __asm {
|
||||
movdqa xmm1, xmm0
|
||||
punpcklbw xmm0, xmm2
|
||||
punpckhbw xmm1, xmm2
|
||||
movdqa [edx], xmm0 // store 4 pixels of ARGB
|
||||
movdqa [edx + 16], xmm1 // store next 4 pixels of ARGB
|
||||
lea edx, [edx + 32]
|
||||
movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
|
||||
movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
|
||||
lea eax, [eax + 16]
|
||||
sub ecx, 8
|
||||
ja convertloop
|
||||
ret
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user