Optimize unlimited data for Intel

Use unsigned coefficient and signed UV value in YUVTORGB.

R=fbarchard@chromium.org

Bug: libyuv:862, libyuv:863
Change-Id: I32e58b2cee383fb98104c055beb0867a7ad05bfe
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2850016
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Author: Yuan Tong 2021-04-27 22:47:36 +08:00 (committed by Frank Barchard)
parent 5e05f26a2b
commit c9843de02a
5 changed files with 300 additions and 335 deletions
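
For context, a minimal C sketch (not part of this patch) of the per-pixel math that the new CALC_RGB16 macro below implements: Y is scaled by an unsigned coefficient, U and V are re-centered to signed values and multiplied by unsigned coefficients, and the rounding/range bias is folded into the Y term. The BT.601 YG/YB constants appear in the diff; the UB/UG/VG/VR values and the helper names are illustrative approximations, not the exact table entries.

  #include <stdint.h>

  // Illustrative helper: clamp to [0, 255].
  static int Clamp0To255(int v) {
    return v < 0 ? 0 : (v > 255 ? 255 : v);
  }

  // One BT.601 limited-range pixel, mirroring the new CALC_RGB16 path.
  static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                             uint8_t* b, uint8_t* g, uint8_t* r) {
    const int yg = 18997;  // round(1.164 * 64 * 256 * 256 / 257)
    const int yb = -1160;  // 1.164 * 64 * -16 + 64 / 2
    const int ub = 129, ug = 25, vg = 52, vr = 102;  // approx. round(c * 64)
    int y1 = (int)((uint32_t)(y * 0x0101 * yg) >> 16) + yb;
    int ui = (int)u - 128;  // signed UV value
    int vi = (int)v - 128;
    *b = Clamp0To255((y1 + ui * ub) >> 6);
    *g = Clamp0To255((y1 - (ui * ug + vi * vg)) >> 6);
    *r = Clamp0To255((y1 + vi * vr) >> 6);
  }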


@ -730,25 +730,16 @@ struct YuvConstants {
uint8_t kUVToB[32];
uint8_t kUVToG[32];
uint8_t kUVToR[32];
int16_t kUVBiasB[16];
int16_t kUVBiasG[16];
int16_t kUVBiasR[16];
int16_t kYToRgb[16];
int16_t kYBiasToRgb[16];
uint8_t kUVMaskBR[32];
};
// Offsets into YuvConstants structure
#define KUVTOB 0
#define KUVTOG 32
#define KUVTOR 64
#define KUVBIASB 96
#define KUVBIASG 128
#define KUVBIASR 160
#define KYTORGB 192
#define KYBIASTORGB 224
#define KUMASKB 256
#define KVMASKR 272
#define KYTORGB 96
#define KYBIASTORGB 128
#endif


@ -55,8 +55,8 @@ static __inline int32_t clamp1023(int32_t v) {
return (-(v >= 1023) | v) & 1023;
}
// clamp to 2^n - 1
static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
// clamp to max
static __inline int32_t ClampMax(int32_t v, int32_t max) {
return (-(v >= max) | v) & max;
}
@ -77,7 +77,7 @@ static __inline int32_t clamp1023(int32_t v) {
return (v > 1023) ? 1023 : v;
}
static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
static __inline int32_t ClampMax(int32_t v, int32_t max) {
return (v > max) ? max : v;
}
@ -1422,46 +1422,37 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// clang-format off
#if defined(__aarch64__) || defined(__arm__)
#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR) \
// Bias values to round, and subtract 128 from U and V.
// For B and R this is negative. For G this is positive.
#define BB (UB * 128 - YB)
#define BG (UG * 128 + VG * 128 + YB)
#define BR (VR * 128 - YB)
#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
{{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
{YG, BB, BG, BR, YB, 0, 0, 0}}
#else
#define UVMASK(C) ((C) > 127 ? 0xff : 0)
#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR) \
#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
{{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
{UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
{0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}, \
{0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), \
0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), \
0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), \
0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR)}}
{YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
#endif
// clang-format on
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR); \
YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB, BR, BG, BB);
YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
// TODO(fbarchard): Generate SIMD structures from float matrix.
// Bias values to round, and subtract 128 from U and V.
// For B and R this is negative. For G this is positive.
#define BB (UB * 128 - YB)
#define BG (UG * 128 + VG * 128 + YB)
#define BR (VR * 128 - YB)
// BT.601 limited range YUV to RGB reference
// R = (Y - 16) * 1.164 + V * 1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
@ -1482,7 +1473,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1507,7 +1498,7 @@ MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YB 32 /* 64 / 2 */
MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1536,7 +1527,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1561,7 +1552,7 @@ MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32 /* 64 / 2 */
MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1590,7 +1581,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1614,7 +1605,7 @@ MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32 /* 64 / 2 */
MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1631,24 +1622,38 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#if defined(__aarch64__) || defined(__arm__)
#define LOAD_YUV_CONSTANTS \
int ub = -yuvconstants->kUVCoeff[0]; \
int vr = -yuvconstants->kUVCoeff[1]; \
int ub = yuvconstants->kUVCoeff[0]; \
int vr = yuvconstants->kUVCoeff[1]; \
int ug = yuvconstants->kUVCoeff[2]; \
int vg = yuvconstants->kUVCoeff[3]; \
int yg = yuvconstants->kRGBCoeffBias[0]; \
int bb = -yuvconstants->kRGBCoeffBias[1]; \
int bb = yuvconstants->kRGBCoeffBias[1]; \
int bg = yuvconstants->kRGBCoeffBias[2]; \
int br = -yuvconstants->kRGBCoeffBias[3]
int br = yuvconstants->kRGBCoeffBias[3]
#define CALC_RGB16 \
int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
int b16 = y1 + (u * ub) - bb; \
int g16 = y1 + bg - (u * ug + v * vg); \
int r16 = y1 + (v * vr) - br
#else
#define LOAD_YUV_CONSTANTS \
int ub = -yuvconstants->kUVToB[0]; \
int ub = yuvconstants->kUVToB[0]; \
int ug = yuvconstants->kUVToG[0]; \
int vg = yuvconstants->kUVToG[1]; \
int vr = -yuvconstants->kUVToR[1]; \
int bb = -yuvconstants->kUVBiasB[0]; \
int bg = yuvconstants->kUVBiasG[0]; \
int br = -yuvconstants->kUVBiasR[0]; \
int yg = yuvconstants->kYToRgb[0]
int vr = yuvconstants->kUVToR[1]; \
int yg = yuvconstants->kYToRgb[0]; \
int yb = yuvconstants->kYBiasToRgb[0]
#define CALC_RGB16 \
int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
int8_t ui = u; \
int8_t vi = v; \
ui -= 0x80; \
vi -= 0x80; \
int b16 = y1 + (ui * ub); \
int g16 = y1 - (ui * ug + vi * vg); \
int r16 = y1 + (vi * vr)
#endif
// C reference code that mimics the YUV assembly.
@ -1661,11 +1666,11 @@ static __inline void YuvPixel(uint8_t y,
uint8_t* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
*b = Clamp((int32_t)(y1 - (u * ub) + bb) >> 6);
*g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6);
*r = Clamp((int32_t)(y1 - (v * vr) + br) >> 6);
uint32_t y32 = y * 0x0101;
CALC_RGB16;
*b = Clamp((int32_t)(b16) >> 6);
*g = Clamp((int32_t)(g16) >> 6);
*r = Clamp((int32_t)(r16) >> 6);
}
// Reads 8 bit YUV and leaves result as 16 bit.
@ -1677,11 +1682,11 @@ static __inline void YuvPixel8_16(uint8_t y,
int* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
*b = (int)(y1 - (u * ub) + bb);
*g = (int)(y1 - (u * ug + v * vg) + bg);
*r = (int)(y1 - (v * vr) + br);
uint32_t y32 = y * 0x0101;
CALC_RGB16;
*b = b16;
*g = g16;
*r = r16;
}
// C reference code that mimics the YUV 16 bit assembly.
@ -1694,13 +1699,13 @@ static __inline void YuvPixel10_16(uint16_t y,
int* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
uint32_t y32 = y << 6;
u = clamp255(u >> 2);
v = clamp255(v >> 2);
*b = (int)(-(u * ub) + y1 + bb);
*g = (int)(-(u * ug + v * vg) + y1 + bg);
*r = (int)(-(v * vr) + y1 + br);
CALC_RGB16;
*b = b16;
*g = g16;
*r = r16;
}
// C reference code that mimics the YUV 16 bit assembly.
@ -1713,13 +1718,13 @@ static __inline void YuvPixel12_16(int16_t y,
int* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)((y << 4) * yg) >> 16;
uint32_t y32 = y << 4;
u = clamp255(u >> 4);
v = clamp255(v >> 4);
*b = (int)(-(u * ub) + y1 + bb);
*g = (int)(-(u * ug + v * vg) + y1 + bg);
*r = (int)(-(v * vr) + y1 + br);
CALC_RGB16;
*b = b16;
*g = g16;
*r = r16;
}
// C reference code that mimics the YUV 10 bit assembly.
@ -1768,13 +1773,13 @@ static __inline void YuvPixel16_8(uint16_t y,
uint8_t* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)(y * yg) >> 16;
uint32_t y32 = y;
u = clamp255(u >> 8);
v = clamp255(v >> 8);
*b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6);
*g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6);
*r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6);
CALC_RGB16;
*b = Clamp((int32_t)(b16) >> 6);
*g = Clamp((int32_t)(g16) >> 6);
*r = Clamp((int32_t)(r16) >> 6);
}
// C reference code that mimics the YUV 16 bit assembly.
@ -1787,13 +1792,13 @@ static __inline void YuvPixel16_16(uint16_t y,
int* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)(y * yg) >> 16;
uint32_t y32 = y;
u = clamp255(u >> 8);
v = clamp255(v >> 8);
*b = (int)(y1 + -(u * ub) + bb);
*g = (int)(y1 + -(u * ug + v * vg) + bg);
*r = (int)(y1 + -(v * vr) + br);
CALC_RGB16;
*b = b16;
*g = g16;
*r = r16;
}
// C reference code that mimics the YUV assembly.
@ -2779,10 +2784,10 @@ void MergeAR64Row_C(const uint16_t* src_r,
int shift = 16 - depth;
int max = (1 << depth) - 1;
for (x = 0; x < width; ++x) {
dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
dst_ar64[3] = clamp2nm1(src_a[x], max) << shift;
dst_ar64[0] = ClampMax(src_b[x], max) << shift;
dst_ar64[1] = ClampMax(src_g[x], max) << shift;
dst_ar64[2] = ClampMax(src_r[x], max) << shift;
dst_ar64[3] = ClampMax(src_a[x], max) << shift;
dst_ar64 += 4;
}
}
@ -2819,9 +2824,9 @@ void MergeXR64Row_C(const uint16_t* src_r,
int shift = 16 - depth;
int max = (1 << depth) - 1;
for (x = 0; x < width; ++x) {
dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
dst_ar64[0] = ClampMax(src_b[x], max) << shift;
dst_ar64[1] = ClampMax(src_g[x], max) << shift;
dst_ar64[2] = ClampMax(src_r[x], max) << shift;
dst_ar64[3] = 0xffff;
dst_ar64 += 4;
}


@ -2312,78 +2312,65 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants) \
"pcmpeqb %%xmm13,%%xmm13 \n" \
"movdqa (%[yuvconstants]),%%xmm8 \n" \
"pxor %%xmm12,%%xmm12 \n" \
"movdqa 32(%[yuvconstants]),%%xmm9 \n" \
"psllw $7,%%xmm13 \n" \
"movdqa 64(%[yuvconstants]),%%xmm10 \n" \
"pshufb %%xmm12,%%xmm13 \n" \
"movdqa 96(%[yuvconstants]),%%xmm11 \n" \
"movdqa 128(%[yuvconstants]),%%xmm12 \n" \
"movdqa 160(%[yuvconstants]),%%xmm13 \n" \
"movdqa 192(%[yuvconstants]),%%xmm14 \n" \
"movdqa 256(%[yuvconstants]),%%xmm15 \n" \
"movdqa 272(%[yuvconstants]),%%xmm7 \n"
"movdqa 128(%[yuvconstants]),%%xmm12 \n"
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants) \
"movdqa %%xmm3,%%xmm0 \n" \
"movdqa %%xmm3,%%xmm1 \n" \
"movdqa %%xmm3,%%xmm2 \n" \
"pmaddubsw %%xmm8,%%xmm0 \n" \
"pmaddubsw %%xmm10,%%xmm2 \n" \
"psllw $8,%%xmm1 \n" \
"pand %%xmm15,%%xmm1 \n" \
"paddw %%xmm1,%%xmm0 \n" \
"movdqa %%xmm3,%%xmm1 \n" \
"pmaddubsw %%xmm9,%%xmm1 \n" \
"pmulhuw %%xmm14,%%xmm4 \n" \
"pand %%xmm7,%%xmm3 \n" \
"paddw %%xmm3,%%xmm2 \n" \
"paddw %%xmm4,%%xmm0 \n" \
"paddw %%xmm4,%%xmm2 \n" \
"psubb %%xmm13,%%xmm3 \n" \
"pmulhuw %%xmm11,%%xmm4 \n" \
"movdqa %%xmm8,%%xmm0 \n" \
"movdqa %%xmm9,%%xmm1 \n" \
"movdqa %%xmm10,%%xmm2 \n" \
"paddw %%xmm12,%%xmm4 \n" \
"psubusw %%xmm11,%%xmm0 \n" \
"psubusw %%xmm1,%%xmm4 \n" \
"psubusw %%xmm13,%%xmm2 \n" \
"pmaddubsw %%xmm3,%%xmm0 \n" \
"pmaddubsw %%xmm3,%%xmm1 \n" \
"pmaddubsw %%xmm3,%%xmm2 \n" \
"paddsw %%xmm4,%%xmm0 \n" \
"paddsw %%xmm4,%%xmm2 \n" \
"psubsw %%xmm1,%%xmm4 \n" \
"movdqa %%xmm4,%%xmm1 \n"
#define YUVTORGB_REGS \
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants) \
"movdqa %%xmm3,%%xmm0 \n" \
"movdqa %%xmm3,%%xmm1 \n" \
"movdqa %%xmm3,%%xmm2 \n" \
"pmaddubsw (%[yuvconstants]),%%xmm0 \n" \
"pmaddubsw 64(%[yuvconstants]),%%xmm2 \n" \
"psllw $8,%%xmm1 \n" \
"pand 256(%[yuvconstants]),%%xmm1 \n" \
"paddw %%xmm1,%%xmm0 \n" \
"movdqa %%xmm3,%%xmm1 \n" \
"pmaddubsw 32(%[yuvconstants]),%%xmm1 \n" \
"pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
"pand 272(%[yuvconstants]),%%xmm3 \n" \
"paddw %%xmm3,%%xmm2 \n" \
"movdqa 128(%[yuvconstants]),%%xmm7 \n" \
"paddw %%xmm4,%%xmm0 \n" \
"paddw %%xmm4,%%xmm2 \n" \
"paddw %%xmm7,%%xmm4 \n" \
"movdqa 96(%[yuvconstants]),%%xmm7 \n" \
"psubusw %%xmm7,%%xmm0 \n" \
"psubusw %%xmm1,%%xmm4 \n" \
"movdqa 160(%[yuvconstants]),%%xmm7 \n" \
"psubusw %%xmm7,%%xmm2 \n" \
"movdqa %%xmm4,%%xmm1 \n" \
"pcmpeqb %%xmm0,%%xmm0 \n" \
"pxor %%xmm1,%%xmm1 \n" \
"psllw $7,%%xmm0 \n" \
"pshufb %%xmm1,%%xmm0 \n" \
"psubb %%xmm0,%%xmm3 \n" \
"pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
"movdqa (%[yuvconstants]),%%xmm0 \n" \
"movdqa 32(%[yuvconstants]),%%xmm1 \n" \
"movdqa 64(%[yuvconstants]),%%xmm2 \n" \
"pmaddubsw %%xmm3,%%xmm0 \n" \
"pmaddubsw %%xmm3,%%xmm1 \n" \
"pmaddubsw %%xmm3,%%xmm2 \n" \
"movdqa 128(%[yuvconstants]),%%xmm3 \n" \
"paddw %%xmm3,%%xmm4 \n" \
"paddsw %%xmm4,%%xmm0 \n" \
"paddsw %%xmm4,%%xmm2 \n" \
"psubsw %%xmm1,%%xmm4 \n" \
"movdqa %%xmm4,%%xmm1 \n"
#define YUVTORGB_REGS "xmm7",
#define YUVTORGB_REGS
#endif
#define YUVTORGB(yuvconstants) \
YUVTORGB16(yuvconstants) \
"psrlw $0x6,%%xmm0 \n" \
"psrlw $0x6,%%xmm1 \n" \
"psrlw $0x6,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
@ -2416,9 +2403,12 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
"psraw $0x4,%%xmm0 \n" \
"psraw $0x4,%%xmm1 \n" \
"psraw $0x4,%%xmm2 \n" \
"pminuw %%xmm6,%%xmm0 \n" \
"pminuw %%xmm6,%%xmm1 \n" \
"pminuw %%xmm6,%%xmm2 \n" \
"pminsw %%xmm7,%%xmm0 \n" \
"pminsw %%xmm7,%%xmm1 \n" \
"pminsw %%xmm7,%%xmm2 \n" \
"pmaxsw %%xmm6,%%xmm0 \n" \
"pmaxsw %%xmm6,%%xmm1 \n" \
"pmaxsw %%xmm6,%%xmm2 \n" \
"psllw $0x4,%%xmm2 \n" \
"movdqa %%xmm0,%%xmm3 \n" \
"punpcklwd %%xmm2,%%xmm0 \n" \
@ -2588,8 +2578,9 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -2605,7 +2596,7 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -2682,8 +2673,9 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -2699,7 +2691,7 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -2716,8 +2708,9 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -2733,7 +2726,7 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -2850,8 +2843,9 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -2867,7 +2861,7 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -3076,8 +3070,9 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -3092,7 +3087,7 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -3106,8 +3101,9 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -3122,7 +3118,7 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -3360,70 +3356,58 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
#if defined(__x86_64__)
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
"vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
"vmovdqa (%[yuvconstants]),%%ymm8 \n" \
"vpsllw $7,%%xmm13,%%xmm13 \n" \
"vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
"vpbroadcastb %%xmm13,%%ymm13 \n" \
"vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
"vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
"vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
"vmovdqa 192(%[yuvconstants]),%%ymm14 \n" \
"vbroadcastf128 256(%[yuvconstants]),%%ymm15 \n" \
"vbroadcastf128 272(%[yuvconstants]),%%ymm7 \n"
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
// TODO(yuan): Consider signed UV and unsigned coefficient for vpmaddubsw.
#define YUVTORGB16_AVX2(yuvconstants) \
"vpmaddubsw %%ymm8,%%ymm3,%%ymm0 \n" \
"vpmaddubsw %%ymm10,%%ymm3,%%ymm2 \n" \
"vpsllw $8,%%ymm3,%%ymm1 \n" \
"vpand %%ymm1,%%ymm15,%%ymm1 \n" \
"vpaddw %%ymm1,%%ymm0,%%ymm0 \n" \
"vpmaddubsw %%ymm9,%%ymm3,%%ymm1 \n" \
"vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
"vpand %%ymm3,%%ymm7,%%ymm3 \n" \
"vpaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpaddw %%ymm4,%%ymm12,%%ymm3 \n" \
"vpaddw %%ymm4,%%ymm2,%%ymm2 \n" \
"vpsubusw %%ymm11,%%ymm0,%%ymm0 \n" \
"vpsubusw %%ymm1,%%ymm3,%%ymm1 \n" \
"vpsubusw %%ymm13,%%ymm2,%%ymm2 \n"
"vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
"vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
"vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
"vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
"vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
"vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2 \
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
#else // Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB16_AVX2(yuvconstants) \
"vpmaddubsw (%[yuvconstants]),%%ymm3,%%ymm0 \n" \
"vpmaddubsw 64(%[yuvconstants]),%%ymm3,%%ymm2 \n" \
"vpsllw $8,%%ymm3,%%ymm1 \n" \
"vbroadcastf128 256(%[yuvconstants]),%%ymm7 \n" \
"vpand %%ymm7,%%ymm1,%%ymm1 \n" \
"vpaddw %%ymm1,%%ymm0,%%ymm0 \n" \
"vpmaddubsw 32(%[yuvconstants]),%%ymm3,%%ymm1 \n" \
"vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
"vbroadcastf128 272(%[yuvconstants]),%%ymm7 \n" \
"vpand %%ymm7,%%ymm3,%%ymm3 \n" \
"vpaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" \
"vmovdqu 128(%[yuvconstants]),%%ymm7 \n" \
"vpaddw %%ymm4,%%ymm7,%%ymm3 \n" \
"vpaddw %%ymm4,%%ymm2,%%ymm2 \n" \
"vmovdqu 96(%[yuvconstants]),%%ymm7 \n" \
"vpsubusw %%ymm7,%%ymm0,%%ymm0 \n" \
"vpsubusw %%ymm1,%%ymm3,%%ymm1 \n" \
"vmovdqu 160(%[yuvconstants]),%%ymm7 \n" \
"vpsubusw %%ymm7,%%ymm2,%%ymm2 \n"
"vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
"vpsllw $7,%%xmm0,%%xmm0 \n" \
"vpbroadcastb %%xmm0,%%ymm0 \n" \
"vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
"vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
"vmovdqa (%[yuvconstants]),%%ymm0 \n" \
"vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
"vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
"vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
"vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
"vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
"vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2 "xmm7",
#define YUVTORGB_REGS_AVX2
#endif
#define YUVTORGB_AVX2(yuvconstants) \
YUVTORGB16_AVX2(yuvconstants) \
"vpsrlw $0x6,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x6,%%ymm1,%%ymm1 \n" \
"vpsrlw $0x6,%%ymm2,%%ymm2 \n" \
"vpsraw $0x6,%%ymm0,%%ymm0 \n" \
"vpsraw $0x6,%%ymm1,%%ymm1 \n" \
"vpsraw $0x6,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
@ -3442,12 +3426,15 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
// Store 16 AR30 values.
#define STOREAR30_AVX2 \
"vpsrlw $0x4,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x4,%%ymm1,%%ymm1 \n" \
"vpsrlw $0x4,%%ymm2,%%ymm2 \n" \
"vpminuw %%ymm6,%%ymm0,%%ymm0 \n" \
"vpminuw %%ymm6,%%ymm1,%%ymm1 \n" \
"vpminuw %%ymm6,%%ymm2,%%ymm2 \n" \
"vpsraw $0x4,%%ymm0,%%ymm0 \n" \
"vpsraw $0x4,%%ymm1,%%ymm1 \n" \
"vpsraw $0x4,%%ymm2,%%ymm2 \n" \
"vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
"vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
"vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
"vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
"vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
"vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
"vpsllw $0x4,%%ymm2,%%ymm2 \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm1,%%ymm1 \n" \
@ -3548,8 +3535,9 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -3567,7 +3555,7 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I422TOAR30ROW_AVX2
@ -3657,8 +3645,9 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -3676,7 +3665,7 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I210TOAR30ROW_AVX2
@ -3696,8 +3685,9 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -3715,7 +3705,7 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I212TOAR30ROW_AVX2
@ -3842,8 +3832,9 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -3861,7 +3852,7 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I410TOAR30ROW_AVX2
@ -4204,8 +4195,9 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -4240,8 +4232,9 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -4269,8 +4262,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
"movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
"movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
"movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164
"movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
"pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
"pslld $0x18,%%xmm4 \n"
@ -4314,8 +4307,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
"vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
"vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
"vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164
"vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
"vpslld $0x18,%%ymm4,%%ymm4 \n"


@ -75,28 +75,18 @@ extern "C" {
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants) \
xmm0 = _mm_loadu_si128(&xmm3); \
xmm1 = _mm_loadu_si128(&xmm3); \
xmm2 = _mm_loadu_si128(&xmm3); \
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
xmm1 = _mm_slli_epi16(xmm1, 8); \
xmm1 = _mm_and_si128(xmm1, *(__m128i*)yuvconstants->kUVMaskBR); \
xmm0 = _mm_add_epi16(xmm0, xmm1); \
xmm1 = _mm_loadu_si128(&xmm3); \
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80)); \
xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
xmm3 = _mm_and_si128(xmm3, *((__m128i*)(yuvconstants->kUVMaskBR) + 1)); \
xmm2 = _mm_add_epi16(xmm2, xmm3); \
xmm0 = _mm_add_epi16(xmm0, xmm4); \
xmm2 = _mm_add_epi16(xmm2, xmm4); \
xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kUVBiasG); \
xmm0 = _mm_subs_epu16(xmm0, *(__m128i*)yuvconstants->kUVBiasB); \
xmm1 = _mm_subs_epu16(xmm4, xmm1); \
xmm2 = _mm_subs_epu16(xmm2, *(__m128i*)yuvconstants->kUVBiasR); \
xmm0 = _mm_srli_epi16(xmm0, 6); \
xmm1 = _mm_srli_epi16(xmm1, 6); \
xmm2 = _mm_srli_epi16(xmm2, 6); \
xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
xmm0 = _mm_adds_epi16(xmm4, xmm0); \
xmm1 = _mm_subs_epi16(xmm4, xmm1); \
xmm2 = _mm_adds_epi16(xmm4, xmm2); \
xmm0 = _mm_srai_epi16(xmm0, 6); \
xmm1 = _mm_srai_epi16(xmm1, 6); \
xmm2 = _mm_srai_epi16(xmm2, 6); \
xmm0 = _mm_packus_epi16(xmm0, xmm0); \
xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2);
@ -254,11 +244,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
0x8080u, 0x8080u, 0x8080u, 0x8080u};
// 8 bit fixed point 0.5, for bias of UV.
static const ulvec8 kBiasUV128 = {
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
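
kBiasUV128 now does double duty: the RGB-to-UV rows below use it in place of kAddUV128/kAddUVJ128 as the +128 output bias, and the new YUVTORGB code subtracts it (psubb / vpsubb / _mm_sub_epi8) to reinterpret unsigned U/V bytes as signed values centered at zero. A hypothetical one-byte view:

  uint8_t u = 64;                  // unsigned chroma sample
  int8_t ui = (int8_t)(u - 0x80);  // -64, the signed value fed to pmaddubsw
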
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
@ -1447,7 +1437,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@ -1519,7 +1509,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUVJ128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToVJ
movdqa xmm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
@ -1593,7 +1583,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
vbroadcastf128 ymm5, xmmword ptr kAddUV128
vbroadcastf128 ymm5, xmmword ptr kBiasUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@ -1661,7 +1651,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
vbroadcastf128 ymm5, xmmword ptr kBiasUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
@ -1726,7 +1716,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@ -1787,7 +1777,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kBGRAToV
movdqa xmm7, xmmword ptr kBGRAToU
sub edi, edx // stride from u to v
@ -1859,7 +1849,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kABGRToV
movdqa xmm7, xmmword ptr kABGRToU
sub edi, edx // stride from u to v
@ -1931,7 +1921,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kRGBAToV
movdqa xmm7, xmmword ptr kRGBAToU
sub edi, edx // stride from u to v
@ -2098,32 +2088,25 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) \
__asm { \
__asm vpmaddubsw ymm0, ymm3, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
__asm vpmaddubsw ymm2, ymm3, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
__asm vpsllw ymm1, ymm3, 8 \
__asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KUMASKB] \
__asm vpand ymm1, ymm1, ymm6 \
__asm vpaddw ymm0, ymm0, ymm1 \
__asm vpmaddubsw ymm1, ymm3, ymmword ptr [YuvConstants + KUVTOG] /* B UV */\
__asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
__asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KVMASKR] \
__asm vpand ymm3, ymm3, ymm6 \
__asm vpaddw ymm2, ymm2, ymm3 \
__asm vpaddw ymm0, ymm0, ymm4 \
__asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASG] \
__asm vpaddw ymm3, ymm4, ymm6 \
__asm vpaddw ymm2, ymm2, ymm4 \
__asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASB] \
__asm vpsubusw ymm0, ymm0, ymm6 \
__asm vpsubusw ymm1, ymm3, ymm1 \
__asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASR] \
__asm vpsubusw ymm2, ymm2, ymm6 \
__asm vpsrlw ymm0, ymm0, 6 \
__asm vpsrlw ymm1, ymm1, 6 \
__asm vpsrlw ymm2, ymm2, 6 \
__asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
__asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
__asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
__asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
__asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
__asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
__asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
__asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
__asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */ \
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
__asm vpaddw ymm4, ymm3, ymm4 \
__asm vpaddsw ymm0, ymm0, ymm4 \
__asm vpsubsw ymm1, ymm4, ymm1 \
__asm vpaddsw ymm2, ymm2, ymm4 \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
__asm vpackuswb ymm0, ymm0, ymm0 \
__asm vpackuswb ymm1, ymm1, ymm1 \
__asm vpackuswb ymm2, ymm2, ymm2 \
}
// Store 16 ARGB values.
@ -2583,30 +2566,23 @@ __declspec(naked) void I422ToRGBARow_AVX2(
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) \
__asm { \
__asm movdqa xmm0, xmm3 \
__asm movdqa xmm1, xmm3 \
__asm movdqa xmm2, xmm3 \
__asm pmaddubsw xmm0, xmmword ptr [YuvConstants + KUVTOB] \
__asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOR] \
__asm psllw xmm1, 8 \
__asm pand xmm1, xmmword ptr [YuvConstants + KUMASKB] \
__asm paddw xmm0, xmm1 \
__asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOG] \
__asm psubb xmm3, xmmword ptr kBiasUV128 \
__asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
__asm pand xmm3, xmmword ptr [YuvConstants + KVMASKR] \
__asm paddw xmm0, xmm4 \
__asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASG] \
__asm paddw xmm2, xmm4 \
__asm paddw xmm4, xmm6 \
__asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASB] \
__asm psubusw xmm0, xmm6 \
__asm psubusw xmm4, xmm1 \
__asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASR] \
__asm psubusw xmm2, xmm6 \
__asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
__asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
__asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
__asm pmaddubsw xmm0, xmm3 \
__asm pmaddubsw xmm1, xmm3 \
__asm pmaddubsw xmm2, xmm3 \
__asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
__asm paddw xmm4, xmm3 \
__asm paddsw xmm0, xmm4 \
__asm paddsw xmm2, xmm4 \
__asm psubsw xmm4, xmm1 \
__asm movdqa xmm1, xmm4 \
__asm psrlw xmm0, 6 \
__asm psrlw xmm1, 6 \
__asm psrlw xmm2, 6 \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \


@ -11,7 +11,7 @@
#ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT
#define UNIT_TEST_UNIT_TEST_H_
#ifdef WIN32
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/resource.h>