diff --git a/source/row_posix.cc b/source/row_posix.cc index 7b8533f2d..ae15e0ceb 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1526,6 +1526,17 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = { "packuswb %%xmm1,%%xmm1 \n" \ "packuswb %%xmm2,%%xmm2 \n" +// Store 8 ARGB values. Assumes XMM5 is zero. +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1538,14 +1549,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "1: \n" READYUV444 YUVTORGB(kYuvConstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -1660,14 +1664,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "1: \n" READYUV422 YUVTORGB(kYuvConstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -1693,14 +1690,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "1: \n" READYUV411 YUVTORGB(kYuvConstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -1724,14 +1714,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, "1: \n" READNV12 YUVTORGB(kYuvConstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -1754,14 +1737,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, "1: \n" READNV12 YUVTORGB(kYuvConstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] diff --git a/source/row_win.cc b/source/row_win.cc index 90b269a1a..0a3439de2 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -316,7 +316,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { por xmm3, xmm5 movdqu [edx + 48], xmm3 lea edx, [edx + 64] - sub ecx, 16 + sub ecx, 16 jg convertloop ret } @@ -1772,6 +1772,19 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm packuswb xmm2, xmm2 /* R */ \ } +// Store 8 ARGB values. +#define STOREARGB __asm { \ + /* Step 3: Weave into ARGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm movdqu [edx], xmm0 \ + __asm movdqu [edx + 16], xmm1 \ + __asm lea edx, [edx + 32] \ + } + // 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) __declspec(align(16)) @@ -1794,16 +1807,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, convertloop: READYUV444 YUVTORGB(kYuvConstants) - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // ABGR first 4 pixels - punpckhwd xmm1, xmm2 // ABGR next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] + STOREARGB sub ecx, 8 jg convertloop @@ -1996,16 +2000,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, convertloop: READYUV422 YUVTORGB(kYuvConstants) - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] + STOREARGB sub ecx, 8 jg convertloop @@ -2039,16 +2034,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, convertloop: READYUV411 // modifies EBX YUVTORGB(kYuvConstants) - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] + STOREARGB sub ecx, 8 jg convertloop @@ -2077,16 +2063,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, convertloop: READNV12 YUVTORGB(kYuvConstants) - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] + STOREARGB sub ecx, 8 jg convertloop @@ -2113,16 +2090,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, convertloop: READNV12 YUVTORGB(kYvuConstants) - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] + STOREARGB sub ecx, 8 jg convertloop