diff --git a/README.chromium b/README.chromium index 3ccd4ff58..dcd742d72 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 807 +Version: 808 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 48d2ab025..7ddd501be 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -64,24 +64,26 @@ extern "C" { #define HAS_SOBELYROW_SSSE3 // Conversions: -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVJROW_SSSE3 #define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBTOBAYERROW_SSSE3 +#define HAS_ARGBTOUV422ROW_SSSE3 +#define HAS_ARGBTOUV444ROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 -#define HAS_ARGBTOUV422ROW_SSSE3 -#define HAS_BGRATOYROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_COPYROW_X86 #define HAS_FIXEDDIV_X86 +#define HAS_HALFROW_SSE2 #define HAS_I400TOARGBROW_SSE2 -#define HAS_RGBATOYROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3 +#define HAS_RGBATOYROW_SSSE3 #define HAS_SETROW_X86 #define HAS_UYVYTOUVROW_SSE2 #define HAS_UYVYTOYROW_SSE2 @@ -103,7 +105,6 @@ extern "C" { #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565ROW_SSE2 -#define HAS_HALFROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 @@ -163,8 +164,6 @@ extern "C" { // The following are Windows only: // TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_ARGBSHUFFLEROW_SSE2 - // Effects: // TODO(fbarchard): Optimize and enable // #define HAS_ARGBLUMACOLORTABLEROW_SSSE3 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 8f9e8d5db..b2f392368 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 807 +#define LIBYUV_VERSION 808 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 839c7e370..41e607938 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -40,6 +40,9 @@ extern "C" { #define MEMOPMEM(opcode, reg, offset, base, index, scale) \ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ #opcode " %%" #reg ",(%%r15,%%r14)\n" +#define MEMOP(opcode, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14)" #define BUNDLEALIGN ".p2align 5 \n" #else #define MEMACCESS(base) "(%" #base ")" @@ -53,6 +56,8 @@ extern "C" { #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" #define MEMOPMEM(opcode, reg, offset, base, index, scale) \ #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define MEMOP(opcode, offset, base, index, scale) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale ")" #define BUNDLEALIGN #endif @@ -5625,11 +5630,11 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, "sub %0,%1 \n" ".p2align 4 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "pavgb (%0,%3),%%xmm0 \n" + "movdqa "MEMACCESS(0)",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 "sub $0x10,%2 \n" - "movdqa %%xmm0,(%0,%1) \n" - "lea 0x10(%0),%0 \n" + MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) + "lea "MEMLEA(0x10,0)",%0 \n" "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 @@ -5736,9 +5741,7 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "vmovdqa "MEMACCESS(3)",%%xmm5 \n" - "vpermq $0x44,%%ymm5,%%ymm5 \n" - + "vbroadcastf128 "MEMACCESS(3)",%%ymm5 \n" ".p2align 4 \n" "1: \n" "vmovdqu "MEMACCESS(0)",%%ymm0 \n" @@ -5763,6 +5766,132 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + uintptr_t pixel_temp = 0u; + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + "mov "MEMACCESS(4)",%k2 \n" + "cmp $0x3000102,%k2 \n" + "je 3012f \n" + "cmp $0x10203,%k2 \n" + "je 123f \n" + "cmp $0x30201,%k2 \n" + "je 321f \n" + "cmp $0x2010003,%k2 \n" + "je 2103f \n" + + "1: \n" + BUNDLEALIGN + "movzb "MEMACCESS(4)",%2 \n" + MEMOP(movzb,0x00,0,2,1)",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2,"MEMACCESS(1)" \n" + "movzb "MEMACCESS2(0x1,4)",%2 \n" + MEMOP(movzb,0x00,0,2,1)",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2,"MEMACCESS2(0x1,1)" \n" + BUNDLEALIGN + "movzb "MEMACCESS2(0x2,4)",%2 \n" + MEMOP(movzb,0x00,0,2,1)",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2,"MEMACCESS2(0x2,1)" \n" + "movzb "MEMACCESS2(0x3,4)",%2 \n" + MEMOP(movzb,0x00,0,2,1)",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2,"MEMACCESS2(0x3,1)" \n" + "lea "MEMLEA(0x4,0)",%0 \n" + "lea "MEMLEA(0x4,1)",%1 \n" + "sub $0x1,%3 \n" + "jg 1b \n" + "jmp 99f \n" + + ".p2align 4 \n" + "123: \n" + "movdqu "MEMACCESS(0)",%%xmm0 \n" + "lea "MEMLEA(0x10,0)",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm1,%%xmm1 \n" + "pshuflw $0x1b,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0,"MEMACCESS(1)" \n" + "lea "MEMLEA(0x10,1)",%1 \n" + "jg 123b \n" + "jmp 99f \n" + + ".p2align 4 \n" + "321: \n" + "movdqu "MEMACCESS(0)",%%xmm0 \n" + "lea "MEMLEA(0x10,0)",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x39,%%xmm0,%%xmm0 \n" + "pshuflw $0x39,%%xmm0,%%xmm0 \n" + "pshufhw $0x39,%%xmm1,%%xmm1 \n" + "pshuflw $0x39,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0,"MEMACCESS(1)" \n" + "lea "MEMLEA(0x10,1)",%1 \n" + "jg 321b \n" + "jmp 99f \n" + + ".p2align 4 \n" + "2103: \n" + "movdqu "MEMACCESS(0)",%%xmm0 \n" + "lea "MEMLEA(0x10,0)",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x93,%%xmm0,%%xmm0 \n" + "pshuflw $0x93,%%xmm0,%%xmm0 \n" + "pshufhw $0x93,%%xmm1,%%xmm1 \n" + "pshuflw $0x93,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0,"MEMACCESS(1)" \n" + "lea "MEMLEA(0x10,1)",%1 \n" + "jg 2103b \n" + "jmp 99f \n" + + ".p2align 4 \n" + "3012: \n" + "movdqu "MEMACCESS(0)",%%xmm0 \n" + "lea "MEMLEA(0x10,0)",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0xc6,%%xmm0,%%xmm0 \n" + "pshuflw $0xc6,%%xmm0,%%xmm0 \n" + "pshufhw $0xc6,%%xmm1,%%xmm1 \n" + "pshuflw $0xc6,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0,"MEMACCESS(1)" \n" + "lea "MEMLEA(0x10,1)",%1 \n" + "jg 3012b \n" + + "99: \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+d"(pixel_temp), // %2 + "+r"(pix) // %3 + : "r"(shuffler) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSE2 + #ifdef HAS_I422TOYUY2ROW_SSE2 void I422ToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, diff --git a/source/row_win.cc b/source/row_win.cc index b3c4c4735..4a554a828 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -6627,7 +6627,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, ret } } -#endif +#endif // HAS_ARGBSHUFFLEROW_AVX2 __declspec(naked) __declspec(align(16)) void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, @@ -6639,7 +6639,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, mov edx, [esp + 8 + 8] // dst_argb mov esi, [esp + 8 + 12] // shuffler mov ecx, [esp + 8 + 16] // pix - pxor xmm7, xmm7 + pxor xmm5, xmm5 mov ebx, [esi] // shuffler cmp ebx, 0x03000102 @@ -6676,8 +6676,8 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, movdqu xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 - punpckhbw xmm1, xmm7 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB pshuflw xmm0, xmm0, 01Bh pshufhw xmm1, xmm1, 01Bh @@ -6694,8 +6694,8 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, movdqu xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 - punpckhbw xmm1, xmm7 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB pshuflw xmm0, xmm0, 039h pshufhw xmm1, xmm1, 039h @@ -6712,8 +6712,8 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, movdqu xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 - punpckhbw xmm1, xmm7 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA pshuflw xmm0, xmm0, 093h pshufhw xmm1, xmm1, 093h @@ -6730,8 +6730,8 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, movdqu xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 - punpckhbw xmm1, xmm7 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB pshuflw xmm0, xmm0, 0C6h pshufhw xmm1, xmm1, 0C6h