Optimize unlimited data for Intel

Use unsigned coefficients and signed UV values in YUVTORGB.

R=fbarchard@chromium.org

Bug: libyuv:862, libyuv:863
Change-Id: I32e58b2cee383fb98104c055beb0867a7ad05bfe
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2850016
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Author: Yuan Tong
Date: 2021-04-27 22:47:36 +08:00
Committer: Frank Barchard
Parent: 5e05f26a2b
Commit: c9843de02a

5 changed files with 300 additions and 335 deletions
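The gist of the change: U and V are re-biased to signed values (subtract 0x80) so the SIMD code can feed unsigned coefficients and signed UV into pmaddubsw/vpmaddubsw, while rounding and the Y offset are carried in kYBiasToRgb. A minimal standalone C sketch of that per-pixel math, mirroring the CALC_RGB16 reference below (function and variable names here are illustrative, not part of the patch; the coefficient arguments stand in for the kYuv*Constants table entries):

#include <stdint.h>

/* Illustrative sketch of the new x86 reference math (see CALC_RGB16 in the
 * second file below). ub/ug/vg/vr are unsigned UV coefficients, yg/yb the
 * Y gain and bias; all are placeholders for the real table values. */
static uint8_t clamp_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void yuv_pixel_sketch(uint8_t y, uint8_t u, uint8_t v,
                             int ub, int ug, int vg, int vr, int yg, int yb,
                             uint8_t* b, uint8_t* g, uint8_t* r) {
  uint32_t y32 = y * 0x0101;                        /* replicate Y to 16 bits */
  int y1 = (int)((uint32_t)(y32 * yg) >> 16) + yb;  /* scaled, biased Y */
  int ui = (int8_t)(u - 0x80);                      /* signed UV value */
  int vi = (int8_t)(v - 0x80);
  int b16 = y1 + ui * ub;                           /* unsigned coefficient */
  int g16 = y1 - (ui * ug + vi * vg);
  int r16 = y1 + vi * vr;
  *b = clamp_u8(b16 >> 6);                          /* 6-bit fixed point */
  *g = clamp_u8(g16 >> 6);
  *r = clamp_u8(r16 >> 6);
}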


@@ -730,25 +730,16 @@ struct YuvConstants {
   uint8_t kUVToB[32];
   uint8_t kUVToG[32];
   uint8_t kUVToR[32];
-  int16_t kUVBiasB[16];
-  int16_t kUVBiasG[16];
-  int16_t kUVBiasR[16];
   int16_t kYToRgb[16];
   int16_t kYBiasToRgb[16];
-  uint8_t kUVMaskBR[32];
 };
 
 // Offsets into YuvConstants structure
 #define KUVTOB 0
 #define KUVTOG 32
 #define KUVTOR 64
-#define KUVBIASB 96
-#define KUVBIASG 128
-#define KUVBIASR 160
-#define KYTORGB 192
-#define KYBIASTORGB 224
-#define KUMASKB 256
-#define KVMASKR 272
+#define KYTORGB 96
+#define KYBIASTORGB 128
 #endif
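The new offsets follow from the trimmed layout: three 32-byte UV coefficient tables followed by two 16-element int16_t arrays (32 bytes each). A compile-time sanity check of that assumption (illustrative only, not part of the patch) could read:

#include <assert.h>  /* C11 static_assert */
#include <stddef.h>
#include "libyuv/row.h"

/* Hypothetical checks: the KUVTOB..KYBIASTORGB offsets must match the
 * field layout of the slimmed-down struct YuvConstants. */
static_assert(offsetof(struct YuvConstants, kUVToB) == 0, "KUVTOB");
static_assert(offsetof(struct YuvConstants, kUVToG) == 32, "KUVTOG");
static_assert(offsetof(struct YuvConstants, kUVToR) == 64, "KUVTOR");
static_assert(offsetof(struct YuvConstants, kYToRgb) == 96, "KYTORGB");
static_assert(offsetof(struct YuvConstants, kYBiasToRgb) == 128, "KYBIASTORGB");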


@@ -55,8 +55,8 @@ static __inline int32_t clamp1023(int32_t v) {
   return (-(v >= 1023) | v) & 1023;
 }
 
-// clamp to 2^n - 1
-static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
+// clamp to max
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
   return (-(v >= max) | v) & max;
 }
@@ -77,7 +77,7 @@ static __inline int32_t clamp1023(int32_t v) {
   return (v > 1023) ? 1023 : v;
 }
 
-static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
   return (v > max) ? max : v;
 }
@@ -1422,46 +1422,37 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
 // clang-format off
 
 #if defined(__aarch64__) || defined(__arm__)
-#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR) \
+// Bias values to round, and subtract 128 from U and V.
+// For B and R this is negative. For G this is positive.
+#define BB (UB * 128 - YB)
+#define BG (UG * 128 + VG * 128 + YB)
+#define BR (VR * 128 - YB)
+
+#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
   {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
    {YG, BB, BG, BR, YB, 0, 0, 0}}
 #else
-#define UVMASK(C) ((C) > 127 ? 0xff : 0)
-#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR) \
+#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
   {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
   {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
-  {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
-  {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
-  {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
   {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
-  {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}, \
-  {0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), \
-   0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), \
-   0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), \
-   0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR)}}
+  {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
 #endif
 
 // clang-format on
 
-#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \
   const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
-      YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR); \
+      YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \
   const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
-      YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB, BR, BG, BB);
+      YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
 
 // TODO(fbarchard): Generate SIMD structures from float matrix.
 
-// Bias values to round, and subtract 128 from U and V.
-// For B and R this is negative. For G this is positive.
-#define BB (UB * 128 - YB)
-#define BG (UG * 128 + VG * 128 + YB)
-#define BR (VR * 128 - YB)
-
 // BT.601 limited range YUV to RGB reference
 //  R = (Y - 16) * 1.164 + V * 1.596
 //  G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
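The fixed-point constants used in the sections below follow from this reference: Y is replicated to 16 bits (a factor of 257/256), so the 1.164 gain becomes round(1.164 * 64 * 65536 / 257) and the -16 offset plus 0.5 rounding becomes 1.164 * 64 * -16 + 32. A small self-check of that arithmetic (illustrative, not part of the patch):

#include <assert.h>
#include <math.h>

/* Verify the BT.601 limited-range YG/YB derivations quoted in the
 * comments below (18997 and -1160). */
int main(void) {
  int yg = (int)lround(1.164 * 64 * 256 * 256 / 257);
  int yb = (int)lround(1.164 * 64 * -16 + 64 / 2);
  assert(yg == 18997);
  assert(yb == -1160);
  return 0;
}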
@@ -1482,7 +1473,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1507,7 +1498,7 @@ MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
 #define YB 32 /* 64 / 2 */
 
-MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1536,7 +1527,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1561,7 +1552,7 @@ MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
 #define YB 32 /* 64 / 2 */
 
-MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1590,7 +1581,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
 #define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
 
-MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1614,7 +1605,7 @@ MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
 #define YB 32 /* 64 / 2 */
 
-MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1630,25 +1621,39 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #undef MAKEYUVCONSTANTS
 
 #if defined(__aarch64__) || defined(__arm__)
 #define LOAD_YUV_CONSTANTS \
-  int ub = -yuvconstants->kUVCoeff[0]; \
-  int vr = -yuvconstants->kUVCoeff[1]; \
+  int ub = yuvconstants->kUVCoeff[0]; \
+  int vr = yuvconstants->kUVCoeff[1]; \
   int ug = yuvconstants->kUVCoeff[2]; \
   int vg = yuvconstants->kUVCoeff[3]; \
   int yg = yuvconstants->kRGBCoeffBias[0]; \
-  int bb = -yuvconstants->kRGBCoeffBias[1]; \
+  int bb = yuvconstants->kRGBCoeffBias[1]; \
   int bg = yuvconstants->kRGBCoeffBias[2]; \
-  int br = -yuvconstants->kRGBCoeffBias[3]
+  int br = yuvconstants->kRGBCoeffBias[3]
+
+#define CALC_RGB16 \
+  int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
+  int b16 = y1 + (u * ub) - bb; \
+  int g16 = y1 + bg - (u * ug + v * vg); \
+  int r16 = y1 + (v * vr) - br
 #else
 #define LOAD_YUV_CONSTANTS \
-  int ub = -yuvconstants->kUVToB[0]; \
+  int ub = yuvconstants->kUVToB[0]; \
   int ug = yuvconstants->kUVToG[0]; \
   int vg = yuvconstants->kUVToG[1]; \
-  int vr = -yuvconstants->kUVToR[1]; \
-  int bb = -yuvconstants->kUVBiasB[0]; \
-  int bg = yuvconstants->kUVBiasG[0]; \
-  int br = -yuvconstants->kUVBiasR[0]; \
-  int yg = yuvconstants->kYToRgb[0]
+  int vr = yuvconstants->kUVToR[1]; \
+  int yg = yuvconstants->kYToRgb[0]; \
+  int yb = yuvconstants->kYBiasToRgb[0]
+
+#define CALC_RGB16 \
+  int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
+  int8_t ui = u; \
+  int8_t vi = v; \
+  ui -= 0x80; \
+  vi -= 0x80; \
+  int b16 = y1 + (ui * ub); \
+  int g16 = y1 - (ui * ug + vi * vg); \
+  int r16 = y1 + (vi * vr)
 #endif
 
 // C reference code that mimics the YUV assembly.
@@ -1661,11 +1666,11 @@ static __inline void YuvPixel(uint8_t y,
                               uint8_t* r,
                               const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
-  *b = Clamp((int32_t)(y1 - (u * ub) + bb) >> 6);
-  *g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6);
-  *r = Clamp((int32_t)(y1 - (v * vr) + br) >> 6);
+  uint32_t y32 = y * 0x0101;
+  CALC_RGB16;
+  *b = Clamp((int32_t)(b16) >> 6);
+  *g = Clamp((int32_t)(g16) >> 6);
+  *r = Clamp((int32_t)(r16) >> 6);
 }
 
 // Reads 8 bit YUV and leaves result as 16 bit.
@@ -1677,11 +1682,11 @@ static __inline void YuvPixel8_16(uint8_t y,
                                   int* r,
                                   const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
-  *b = (int)(y1 - (u * ub) + bb);
-  *g = (int)(y1 - (u * ug + v * vg) + bg);
-  *r = (int)(y1 - (v * vr) + br);
+  uint32_t y32 = y * 0x0101;
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV 16 bit assembly.
@@ -1694,13 +1699,13 @@ static __inline void YuvPixel10_16(uint16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
+  uint32_t y32 = y << 6;
   u = clamp255(u >> 2);
   v = clamp255(v >> 2);
-  *b = (int)(-(u * ub) + y1 + bb);
-  *g = (int)(-(u * ug + v * vg) + y1 + bg);
-  *r = (int)(-(v * vr) + y1 + br);
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV 16 bit assembly.
@@ -1713,13 +1718,13 @@ static __inline void YuvPixel12_16(int16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y1 = (uint32_t)((y << 4) * yg) >> 16;
+  uint32_t y32 = y << 4;
   u = clamp255(u >> 4);
   v = clamp255(v >> 4);
-  *b = (int)(-(u * ub) + y1 + bb);
-  *g = (int)(-(u * ug + v * vg) + y1 + bg);
-  *r = (int)(-(v * vr) + y1 + br);
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV 10 bit assembly.
@@ -1768,13 +1773,13 @@ static __inline void YuvPixel16_8(uint16_t y,
                                   uint8_t* r,
                                   const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y1 = (uint32_t)(y * yg) >> 16;
+  uint32_t y32 = y;
   u = clamp255(u >> 8);
   v = clamp255(v >> 8);
-  *b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6);
-  *g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6);
-  *r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6);
+  CALC_RGB16;
+  *b = Clamp((int32_t)(b16) >> 6);
+  *g = Clamp((int32_t)(g16) >> 6);
+  *r = Clamp((int32_t)(r16) >> 6);
 }
 
 // C reference code that mimics the YUV 16 bit assembly.
@@ -1787,13 +1792,13 @@ static __inline void YuvPixel16_16(uint16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y1 = (uint32_t)(y * yg) >> 16;
+  uint32_t y32 = y;
   u = clamp255(u >> 8);
   v = clamp255(v >> 8);
-  *b = (int)(y1 + -(u * ub) + bb);
-  *g = (int)(y1 + -(u * ug + v * vg) + bg);
-  *r = (int)(y1 + -(v * vr) + br);
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV assembly.
@@ -2779,10 +2784,10 @@ void MergeAR64Row_C(const uint16_t* src_r,
   int shift = 16 - depth;
   int max = (1 << depth) - 1;
   for (x = 0; x < width; ++x) {
-    dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
-    dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
-    dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
-    dst_ar64[3] = clamp2nm1(src_a[x], max) << shift;
+    dst_ar64[0] = ClampMax(src_b[x], max) << shift;
+    dst_ar64[1] = ClampMax(src_g[x], max) << shift;
+    dst_ar64[2] = ClampMax(src_r[x], max) << shift;
+    dst_ar64[3] = ClampMax(src_a[x], max) << shift;
     dst_ar64 += 4;
   }
 }
@@ -2819,9 +2824,9 @@ void MergeXR64Row_C(const uint16_t* src_r,
   int shift = 16 - depth;
   int max = (1 << depth) - 1;
   for (x = 0; x < width; ++x) {
-    dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
-    dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
-    dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
+    dst_ar64[0] = ClampMax(src_b[x], max) << shift;
+    dst_ar64[1] = ClampMax(src_g[x], max) << shift;
+    dst_ar64[2] = ClampMax(src_r[x], max) << shift;
     dst_ar64[3] = 0xffff;
     dst_ar64 += 4;
   }


@@ -2312,78 +2312,65 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
 #if defined(__x86_64__)
 #define YUVTORGB_SETUP(yuvconstants) \
+  "pcmpeqb %%xmm13,%%xmm13 \n" \
   "movdqa (%[yuvconstants]),%%xmm8 \n" \
+  "pxor %%xmm12,%%xmm12 \n" \
   "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
+  "psllw $7,%%xmm13 \n" \
   "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
+  "pshufb %%xmm12,%%xmm13 \n" \
   "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
-  "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
-  "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
-  "movdqa 192(%[yuvconstants]),%%xmm14 \n" \
-  "movdqa 256(%[yuvconstants]),%%xmm15 \n" \
-  "movdqa 272(%[yuvconstants]),%%xmm7 \n"
+  "movdqa 128(%[yuvconstants]),%%xmm12 \n"
 
 // Convert 8 pixels: 8 UV and 8 Y
 #define YUVTORGB16(yuvconstants) \
-  "movdqa %%xmm3,%%xmm0 \n" \
-  "movdqa %%xmm3,%%xmm1 \n" \
-  "movdqa %%xmm3,%%xmm2 \n" \
-  "pmaddubsw %%xmm8,%%xmm0 \n" \
-  "pmaddubsw %%xmm10,%%xmm2 \n" \
-  "psllw $8,%%xmm1 \n" \
-  "pand %%xmm15,%%xmm1 \n" \
-  "paddw %%xmm1,%%xmm0 \n" \
-  "movdqa %%xmm3,%%xmm1 \n" \
-  "pmaddubsw %%xmm9,%%xmm1 \n" \
-  "pmulhuw %%xmm14,%%xmm4 \n" \
-  "pand %%xmm7,%%xmm3 \n" \
-  "paddw %%xmm3,%%xmm2 \n" \
-  "paddw %%xmm4,%%xmm0 \n" \
-  "paddw %%xmm4,%%xmm2 \n" \
+  "psubb %%xmm13,%%xmm3 \n" \
+  "pmulhuw %%xmm11,%%xmm4 \n" \
+  "movdqa %%xmm8,%%xmm0 \n" \
+  "movdqa %%xmm9,%%xmm1 \n" \
+  "movdqa %%xmm10,%%xmm2 \n" \
   "paddw %%xmm12,%%xmm4 \n" \
-  "psubusw %%xmm11,%%xmm0 \n" \
-  "psubusw %%xmm1,%%xmm4 \n" \
-  "psubusw %%xmm13,%%xmm2 \n" \
+  "pmaddubsw %%xmm3,%%xmm0 \n" \
+  "pmaddubsw %%xmm3,%%xmm1 \n" \
+  "pmaddubsw %%xmm3,%%xmm2 \n" \
+  "paddsw %%xmm4,%%xmm0 \n" \
+  "paddsw %%xmm4,%%xmm2 \n" \
+  "psubsw %%xmm1,%%xmm4 \n" \
   "movdqa %%xmm4,%%xmm1 \n"
 
-#define YUVTORGB_REGS \
-  "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
+#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
 
 #else
 #define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
 #define YUVTORGB16(yuvconstants) \
-  "movdqa %%xmm3,%%xmm0 \n" \
-  "movdqa %%xmm3,%%xmm1 \n" \
-  "movdqa %%xmm3,%%xmm2 \n" \
-  "pmaddubsw (%[yuvconstants]),%%xmm0 \n" \
-  "pmaddubsw 64(%[yuvconstants]),%%xmm2 \n" \
-  "psllw $8,%%xmm1 \n" \
-  "pand 256(%[yuvconstants]),%%xmm1 \n" \
-  "paddw %%xmm1,%%xmm0 \n" \
-  "movdqa %%xmm3,%%xmm1 \n" \
-  "pmaddubsw 32(%[yuvconstants]),%%xmm1 \n" \
-  "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
-  "pand 272(%[yuvconstants]),%%xmm3 \n" \
-  "paddw %%xmm3,%%xmm2 \n" \
-  "movdqa 128(%[yuvconstants]),%%xmm7 \n" \
-  "paddw %%xmm4,%%xmm0 \n" \
-  "paddw %%xmm4,%%xmm2 \n" \
-  "paddw %%xmm7,%%xmm4 \n" \
-  "movdqa 96(%[yuvconstants]),%%xmm7 \n" \
-  "psubusw %%xmm7,%%xmm0 \n" \
-  "psubusw %%xmm1,%%xmm4 \n" \
-  "movdqa 160(%[yuvconstants]),%%xmm7 \n" \
-  "psubusw %%xmm7,%%xmm2 \n" \
-  "movdqa %%xmm4,%%xmm1 \n"
+  "pcmpeqb %%xmm0,%%xmm0 \n" \
+  "pxor %%xmm1,%%xmm1 \n" \
+  "psllw $7,%%xmm0 \n" \
+  "pshufb %%xmm1,%%xmm0 \n" \
+  "psubb %%xmm0,%%xmm3 \n" \
+  "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
+  "movdqa (%[yuvconstants]),%%xmm0 \n" \
+  "movdqa 32(%[yuvconstants]),%%xmm1 \n" \
+  "movdqa 64(%[yuvconstants]),%%xmm2 \n" \
+  "pmaddubsw %%xmm3,%%xmm0 \n" \
+  "pmaddubsw %%xmm3,%%xmm1 \n" \
+  "pmaddubsw %%xmm3,%%xmm2 \n" \
+  "movdqa 128(%[yuvconstants]),%%xmm3 \n" \
+  "paddw %%xmm3,%%xmm4 \n" \
+  "paddsw %%xmm4,%%xmm0 \n" \
+  "paddsw %%xmm4,%%xmm2 \n" \
+  "psubsw %%xmm1,%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm1 \n"
 
-#define YUVTORGB_REGS "xmm7",
+#define YUVTORGB_REGS
 #endif
 
 #define YUVTORGB(yuvconstants) \
   YUVTORGB16(yuvconstants) \
-  "psrlw $0x6,%%xmm0 \n" \
-  "psrlw $0x6,%%xmm1 \n" \
-  "psrlw $0x6,%%xmm2 \n" \
+  "psraw $0x6,%%xmm0 \n" \
+  "psraw $0x6,%%xmm1 \n" \
+  "psraw $0x6,%%xmm2 \n" \
   "packuswb %%xmm0,%%xmm0 \n" \
   "packuswb %%xmm1,%%xmm1 \n" \
   "packuswb %%xmm2,%%xmm2 \n"
@@ -2416,9 +2403,12 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "psraw $0x4,%%xmm0 \n" \
   "psraw $0x4,%%xmm1 \n" \
   "psraw $0x4,%%xmm2 \n" \
-  "pminuw %%xmm6,%%xmm0 \n" \
-  "pminuw %%xmm6,%%xmm1 \n" \
-  "pminuw %%xmm6,%%xmm2 \n" \
+  "pminsw %%xmm7,%%xmm0 \n" \
+  "pminsw %%xmm7,%%xmm1 \n" \
+  "pminsw %%xmm7,%%xmm2 \n" \
+  "pmaxsw %%xmm6,%%xmm0 \n" \
+  "pmaxsw %%xmm6,%%xmm1 \n" \
+  "pmaxsw %%xmm6,%%xmm2 \n" \
   "psllw $0x4,%%xmm2 \n" \
   "movdqa %%xmm0,%%xmm3 \n" \
   "punpcklwd %%xmm2,%%xmm0 \n" \
@@ -2588,8 +2578,9 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"  // AR30 constants
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -2605,7 +2596,7 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
@@ -2682,8 +2673,9 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -2699,7 +2691,7 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
@@ -2716,8 +2708,9 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -2733,7 +2726,7 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
@@ -2850,8 +2843,9 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -2867,7 +2861,7 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
@@ -3076,8 +3070,9 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -3092,7 +3087,7 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
@@ -3106,8 +3101,9 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -3122,7 +3118,7 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
@@ -3360,70 +3356,58 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
 #if defined(__x86_64__)
 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
+  "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
   "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
+  "vpsllw $7,%%xmm13,%%xmm13 \n" \
   "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
+  "vpbroadcastb %%xmm13,%%ymm13 \n" \
   "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
   "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
-  "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
-  "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
-  "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" \
-  "vbroadcastf128 256(%[yuvconstants]),%%ymm15 \n" \
-  "vbroadcastf128 272(%[yuvconstants]),%%ymm7 \n"
+  "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
 
-// TODO(yuan): Consider signed UV and unsigned coefficient for vpmaddubsw.
 #define YUVTORGB16_AVX2(yuvconstants) \
-  "vpmaddubsw %%ymm8,%%ymm3,%%ymm0 \n" \
-  "vpmaddubsw %%ymm10,%%ymm3,%%ymm2 \n" \
-  "vpsllw $8,%%ymm3,%%ymm1 \n" \
-  "vpand %%ymm1,%%ymm15,%%ymm1 \n" \
-  "vpaddw %%ymm1,%%ymm0,%%ymm0 \n" \
-  "vpmaddubsw %%ymm9,%%ymm3,%%ymm1 \n" \
-  "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
-  "vpand %%ymm3,%%ymm7,%%ymm3 \n" \
-  "vpaddw %%ymm3,%%ymm2,%%ymm2 \n" \
-  "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" \
-  "vpaddw %%ymm4,%%ymm12,%%ymm3 \n" \
-  "vpaddw %%ymm4,%%ymm2,%%ymm2 \n" \
-  "vpsubusw %%ymm11,%%ymm0,%%ymm0 \n" \
-  "vpsubusw %%ymm1,%%ymm3,%%ymm1 \n" \
-  "vpsubusw %%ymm13,%%ymm2,%%ymm2 \n"
+  "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
+  "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
+  "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
+  "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
+  "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
+  "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
+  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+  "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
 
 #define YUVTORGB_REGS_AVX2 \
-  "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
 
 #else  // Convert 16 pixels: 16 UV and 16 Y.
 #define YUVTORGB_SETUP_AVX2(yuvconstants)
 #define YUVTORGB16_AVX2(yuvconstants) \
-  "vpmaddubsw (%[yuvconstants]),%%ymm3,%%ymm0 \n" \
-  "vpmaddubsw 64(%[yuvconstants]),%%ymm3,%%ymm2 \n" \
-  "vpsllw $8,%%ymm3,%%ymm1 \n" \
-  "vbroadcastf128 256(%[yuvconstants]),%%ymm7 \n" \
-  "vpand %%ymm7,%%ymm1,%%ymm1 \n" \
-  "vpaddw %%ymm1,%%ymm0,%%ymm0 \n" \
-  "vpmaddubsw 32(%[yuvconstants]),%%ymm3,%%ymm1 \n" \
-  "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
-  "vbroadcastf128 272(%[yuvconstants]),%%ymm7 \n" \
-  "vpand %%ymm7,%%ymm3,%%ymm3 \n" \
-  "vpaddw %%ymm3,%%ymm2,%%ymm2 \n" \
-  "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" \
-  "vmovdqu 128(%[yuvconstants]),%%ymm7 \n" \
-  "vpaddw %%ymm4,%%ymm7,%%ymm3 \n" \
-  "vpaddw %%ymm4,%%ymm2,%%ymm2 \n" \
-  "vmovdqu 96(%[yuvconstants]),%%ymm7 \n" \
-  "vpsubusw %%ymm7,%%ymm0,%%ymm0 \n" \
-  "vpsubusw %%ymm1,%%ymm3,%%ymm1 \n" \
-  "vmovdqu 160(%[yuvconstants]),%%ymm7 \n" \
-  "vpsubusw %%ymm7,%%ymm2,%%ymm2 \n"
+  "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
+  "vpsllw $7,%%xmm0,%%xmm0 \n" \
+  "vpbroadcastb %%xmm0,%%ymm0 \n" \
+  "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
+  "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
+  "vmovdqa (%[yuvconstants]),%%ymm0 \n" \
+  "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
+  "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
+  "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
+  "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
+  "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
+  "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
+  "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
+  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+  "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
 
-#define YUVTORGB_REGS_AVX2 "xmm7",
+#define YUVTORGB_REGS_AVX2
 #endif
 
 #define YUVTORGB_AVX2(yuvconstants) \
   YUVTORGB16_AVX2(yuvconstants) \
-  "vpsrlw $0x6,%%ymm0,%%ymm0 \n" \
-  "vpsrlw $0x6,%%ymm1,%%ymm1 \n" \
-  "vpsrlw $0x6,%%ymm2,%%ymm2 \n" \
+  "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+  "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+  "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
   "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
   "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
   "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
@@ -3438,16 +3422,19 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
   "vmovdqu %%ymm1,(%[dst_argb]) \n" \
   "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
   "lea 0x40(%[dst_argb]), %[dst_argb] \n"
 
 // Store 16 AR30 values.
 #define STOREAR30_AVX2 \
-  "vpsrlw $0x4,%%ymm0,%%ymm0 \n" \
-  "vpsrlw $0x4,%%ymm1,%%ymm1 \n" \
-  "vpsrlw $0x4,%%ymm2,%%ymm2 \n" \
-  "vpminuw %%ymm6,%%ymm0,%%ymm0 \n" \
-  "vpminuw %%ymm6,%%ymm1,%%ymm1 \n" \
-  "vpminuw %%ymm6,%%ymm2,%%ymm2 \n" \
+  "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
+  "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
+  "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
+  "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
+  "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
+  "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
+  "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
+  "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
+  "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
   "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
   "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
   "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
@@ -3548,8 +3535,9 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -3567,7 +3555,7 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_I422TOAR30ROW_AVX2
@@ -3657,8 +3645,9 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -3676,7 +3665,7 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_I210TOAR30ROW_AVX2
@@ -3696,8 +3685,9 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -3715,7 +3705,7 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_I212TOAR30ROW_AVX2
@@ -3842,8 +3832,9 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -3861,7 +3852,7 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_I410TOAR30ROW_AVX2
@@ -4204,8 +4195,9 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -4240,8 +4232,9 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -4269,8 +4262,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile(
-      "movdqa 192(%3),%%xmm2 \n"  // yg = 18997 = 1.164
-      "movdqa 224(%3),%%xmm3 \n"  // ygb = 1160 = 1.164 * 16
+      "movdqa 96(%3),%%xmm2 \n"  // yg = 18997 = 1.164
+      "movdqa 128(%3),%%xmm3 \n"  // ygb = 1160 = 1.164 * 16
       "pcmpeqb %%xmm4,%%xmm4 \n"  // 0xff000000
       "pslld $0x18,%%xmm4 \n"
@@ -4314,8 +4307,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile(
-      "vmovdqa 192(%3),%%ymm2 \n"  // yg = 18997 = 1.164
-      "vmovdqa 224(%3),%%ymm3 \n"  // ygb = -1160 = 1.164*16
+      "vmovdqa 96(%3),%%ymm2 \n"  // yg = 18997 = 1.164
+      "vmovdqa 128(%3),%%ymm3 \n"  // ygb = -1160 = 1.164*16
       "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"  // 0xff000000
       "vpslld $0x18,%%ymm4,%%ymm4 \n"


@@ -75,28 +75,18 @@
 // Convert 8 pixels: 8 UV and 8 Y.
 #define YUVTORGB(yuvconstants) \
-  xmm0 = _mm_loadu_si128(&xmm3); \
-  xmm1 = _mm_loadu_si128(&xmm3); \
-  xmm2 = _mm_loadu_si128(&xmm3); \
-  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
-  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
-  xmm1 = _mm_slli_epi16(xmm1, 8); \
-  xmm1 = _mm_and_si128(xmm1, *(__m128i*)yuvconstants->kUVMaskBR); \
-  xmm0 = _mm_add_epi16(xmm0, xmm1); \
-  xmm1 = _mm_loadu_si128(&xmm3); \
-  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+  xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80)); \
   xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
-  xmm3 = _mm_and_si128(xmm3, *((__m128i*)(yuvconstants->kUVMaskBR) + 1)); \
-  xmm2 = _mm_add_epi16(xmm2, xmm3); \
-  xmm0 = _mm_add_epi16(xmm0, xmm4); \
-  xmm2 = _mm_add_epi16(xmm2, xmm4); \
-  xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kUVBiasG); \
-  xmm0 = _mm_subs_epu16(xmm0, *(__m128i*)yuvconstants->kUVBiasB); \
-  xmm1 = _mm_subs_epu16(xmm4, xmm1); \
-  xmm2 = _mm_subs_epu16(xmm2, *(__m128i*)yuvconstants->kUVBiasR); \
-  xmm0 = _mm_srli_epi16(xmm0, 6); \
-  xmm1 = _mm_srli_epi16(xmm1, 6); \
-  xmm2 = _mm_srli_epi16(xmm2, 6); \
+  xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
+  xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
+  xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
+  xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
+  xmm0 = _mm_adds_epi16(xmm4, xmm0); \
+  xmm1 = _mm_subs_epi16(xmm4, xmm1); \
+  xmm2 = _mm_adds_epi16(xmm4, xmm2); \
+  xmm0 = _mm_srai_epi16(xmm0, 6); \
+  xmm1 = _mm_srai_epi16(xmm1, 6); \
+  xmm2 = _mm_srai_epi16(xmm2, 6); \
   xmm0 = _mm_packus_epi16(xmm0, xmm0); \
   xmm1 = _mm_packus_epi16(xmm1, xmm1); \
   xmm2 = _mm_packus_epi16(xmm2, xmm2);
@@ -254,11 +244,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
 // 7 bit fixed point 0.5.
 static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
 
-static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
-                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
+// 8 bit fixed point 0.5, for bias of UV.
+static const ulvec8 kBiasUV128 = {
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
 
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
@@ -1447,7 +1437,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kARGBToV
     movdqa xmm7, xmmword ptr kARGBToU
     sub edi, edx  // stride from u to v
@@ -1519,7 +1509,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUVJ128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kARGBToVJ
     movdqa xmm7, xmmword ptr kARGBToUJ
     sub edi, edx  // stride from u to v
@@ -1593,7 +1583,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    vbroadcastf128 ymm5, xmmword ptr kAddUV128
+    vbroadcastf128 ymm5, xmmword ptr kBiasUV128
     vbroadcastf128 ymm6, xmmword ptr kARGBToV
     vbroadcastf128 ymm7, xmmword ptr kARGBToU
     sub edi, edx  // stride from u to v
@@ -1661,7 +1651,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
+    vbroadcastf128 ymm5, xmmword ptr kBiasUV128
    vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
    vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
    sub edi, edx  // stride from u to v
@@ -1726,7 +1716,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 4 + 8]  // dst_u
     mov edi, [esp + 4 + 12]  // dst_v
     mov ecx, [esp + 4 + 16]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kARGBToV
     movdqa xmm7, xmmword ptr kARGBToU
     sub edi, edx  // stride from u to v
@@ -1787,7 +1777,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kBGRAToV
     movdqa xmm7, xmmword ptr kBGRAToU
     sub edi, edx  // stride from u to v
@@ -1859,7 +1849,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kABGRToV
     movdqa xmm7, xmmword ptr kABGRToU
     sub edi, edx  // stride from u to v
@@ -1931,7 +1921,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kRGBAToV
     movdqa xmm7, xmmword ptr kRGBAToU
     sub edi, edx  // stride from u to v
@@ -2097,33 +2087,26 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
 // Convert 16 pixels: 16 UV and 16 Y.
 #define YUVTORGB_AVX2(YuvConstants) \
   __asm { \
-    __asm vpmaddubsw ymm0, ymm3, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
-    __asm vpmaddubsw ymm2, ymm3, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
-    __asm vpsllw ymm1, ymm3, 8 \
-    __asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KUMASKB] \
-    __asm vpand ymm1, ymm1, ymm6 \
-    __asm vpaddw ymm0, ymm0, ymm1 \
-    __asm vpmaddubsw ymm1, ymm3, ymmword ptr [YuvConstants + KUVTOG] /* B UV */\
+    __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
     __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
-    __asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KVMASKR] \
-    __asm vpand ymm3, ymm3, ymm6 \
-    __asm vpaddw ymm2, ymm2, ymm3 \
-    __asm vpaddw ymm0, ymm0, ymm4 \
-    __asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASG] \
-    __asm vpaddw ymm3, ymm4, ymm6 \
-    __asm vpaddw ymm2, ymm2, ymm4 \
-    __asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASB] \
-    __asm vpsubusw ymm0, ymm0, ymm6 \
-    __asm vpsubusw ymm1, ymm3, ymm1 \
-    __asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASR] \
-    __asm vpsubusw ymm2, ymm2, ymm6 \
-    __asm vpsrlw ymm0, ymm0, 6 \
-    __asm vpsrlw ymm1, ymm1, 6 \
-    __asm vpsrlw ymm2, ymm2, 6 \
-    __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
-    __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
-    __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+    __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
+    __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
+    __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
+    __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
+    __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
+    __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \
+    __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
+    __asm vpaddw ymm4, ymm3, ymm4 \
+    __asm vpaddsw ymm0, ymm0, ymm4 \
+    __asm vpsubsw ymm1, ymm4, ymm1 \
+    __asm vpaddsw ymm2, ymm2, ymm4 \
+    __asm vpsraw ymm0, ymm0, 6 \
+    __asm vpsraw ymm1, ymm1, 6 \
+    __asm vpsraw ymm2, ymm2, 6 \
+    __asm vpackuswb ymm0, ymm0, ymm0 \
+    __asm vpackuswb ymm1, ymm1, ymm1 \
+    __asm vpackuswb ymm2, ymm2, ymm2 \
   }
 
 // Store 16 ARGB values.
@@ -2582,34 +2565,27 @@ __declspec(naked) void I422ToRGBARow_AVX2(
 // Convert 8 pixels: 8 UV and 8 Y.
 #define YUVTORGB(YuvConstants) \
   __asm { \
-    __asm movdqa xmm0, xmm3 \
-    __asm movdqa xmm1, xmm3 \
-    __asm movdqa xmm2, xmm3 \
-    __asm pmaddubsw xmm0, xmmword ptr [YuvConstants + KUVTOB] \
-    __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOR] \
-    __asm psllw xmm1, 8 \
-    __asm pand xmm1, xmmword ptr [YuvConstants + KUMASKB] \
-    __asm paddw xmm0, xmm1 \
-    __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOG] \
+    __asm psubb xmm3, xmmword ptr kBiasUV128 \
     __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
-    __asm pand xmm3, xmmword ptr [YuvConstants + KVMASKR] \
-    __asm paddw xmm0, xmm4 \
-    __asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASG] \
-    __asm paddw xmm2, xmm4 \
-    __asm paddw xmm4, xmm6 \
-    __asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASG] \
-    __asm psubusw xmm0, xmm6 \
-    __asm psubusw xmm4, xmm1 \
-    __asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASG] \
-    __asm psubusw xmm2, xmm6 \
+    __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
+    __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
+    __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
+    __asm pmaddubsw xmm0, xmm3 \
+    __asm pmaddubsw xmm1, xmm3 \
+    __asm pmaddubsw xmm2, xmm3 \
+    __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
+    __asm paddw xmm4, xmm3 \
+    __asm paddsw xmm0, xmm4 \
+    __asm paddsw xmm2, xmm4 \
+    __asm psubsw xmm4, xmm1 \
     __asm movdqa xmm1, xmm4 \
-    __asm psrlw xmm0, 6 \
-    __asm psrlw xmm1, 6 \
-    __asm psrlw xmm2, 6 \
+    __asm psraw xmm0, 6 \
+    __asm psraw xmm1, 6 \
+    __asm psraw xmm2, 6 \
     __asm packuswb xmm0, xmm0 /* B */ \
     __asm packuswb xmm1, xmm1 /* G */ \
     __asm packuswb xmm2, xmm2 /* R */ \
   }
 
 // Store 8 ARGB values.


@@ -11,7 +11,7 @@
 #ifndef UNIT_TEST_UNIT_TEST_H_  // NOLINT
 #define UNIT_TEST_UNIT_TEST_H_
 
-#ifdef WIN32
+#ifdef _WIN32
 #include <windows.h>
 #else
 #include <sys/resource.h>