From 540e8af80c027b81d2bf244c37355cd0df1f8038 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 2 Dec 2014 22:37:47 +0000 Subject: [PATCH] remove add 16 from ARGBToYJ and add rounding, for consistency with Windows version. row.h header macros sorted alphabetically. BUG=269 TESTED=untested R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/32579005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1185 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 255 ++++++++++++++++++++------------------- include/libyuv/version.h | 2 +- source/row_posix.cc | 25 ++-- source/row_win.cc | 8 +- 5 files changed, 151 insertions(+), 141 deletions(-) diff --git a/README.chromium b/README.chromium index 94d2030dc..7b759bf48 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1184 +Version: 1185 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index ef0c36f0c..653ba73a5 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -57,38 +57,7 @@ extern "C" { // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -// Effects: -#define HAS_ARGBADDROW_SSE2 -#define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBATTENUATEROW_SSSE3 -#define HAS_ARGBBLENDROW_SSSE3 -#define HAS_ARGBCOLORMATRIXROW_SSSE3 -#define HAS_ARGBCOLORTABLEROW_X86 -#define HAS_ARGBCOPYALPHAROW_SSE2 -#define HAS_ARGBCOPYYTOALPHAROW_SSE2 -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSE2 -#define HAS_ARGBMULTIPLYROW_SSE2 -#define HAS_ARGBPOLYNOMIALROW_SSE2 -#define HAS_ARGBQUANTIZEROW_SSE2 -#define HAS_ARGBSEPIAROW_SSSE3 -#define HAS_ARGBSHADEROW_SSE2 -#define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_ARGBUNATTENUATEROW_SSE2 -#define HAS_COMPUTECUMULATIVESUMROW_SSE2 -#define 
HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -#define HAS_INTERPOLATEROW_SSE2 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_RGBCOLORTABLEROW_X86 -#define HAS_SOBELROW_SSE2 -#define HAS_SOBELTOPLANEROW_SSE2 -#define HAS_SOBELXROW_SSE2 -#define HAS_SOBELXYROW_SSE2 -#define HAS_SOBELYROW_SSE2 - // Conversions: -#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ABGRTOUVROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 @@ -105,6 +74,7 @@ extern "C" { #define HAS_ARGBTOUV422ROW_SSSE3 #define HAS_ARGBTOUV444ROW_SSSE3 #define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 @@ -113,9 +83,9 @@ extern "C" { #define HAS_COPYROW_SSE2 #define HAS_I400TOARGBROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 -#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TOBGRAROW_SSSE3 @@ -153,6 +123,36 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 + +// Effects: +#define HAS_ARGBADDROW_SSE2 +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBCOPYALPHAROW_SSE2 +#define HAS_ARGBCOPYYTOALPHAROW_SSE2 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSE2 +#define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_RGBCOLORTABLEROW_X86 +#define 
HAS_SOBELROW_SSE2 +#define HAS_SOBELTOPLANEROW_SSE2 +#define HAS_SOBELXROW_SSE2 +#define HAS_SOBELXYROW_SSE2 +#define HAS_SOBELYROW_SSE2 #endif // The following are available on x64 Visual C: @@ -184,33 +184,33 @@ extern "C" { // The code supports NaCL but requires a new compiler and validator. #if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -#define HAS_COPYROW_AVX -#define HAS_ARGBPOLYNOMIALROW_AVX2 -#define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBCOPYALPHAROW_AVX2 #define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#define HAS_ARGBMIRRORROW_AVX2 +#define HAS_ARGBPOLYNOMIALROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBTOYJROW_AVX2 +#define HAS_ARGBTOYROW_AVX2 +#define HAS_COPYROW_AVX +#define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOARGBROW_AVX2 #define HAS_I422TOBGRAROW_AVX2 -#define HAS_I422TOABGRROW_AVX2 #define HAS_I422TORGBAROW_AVX2 -#define HAS_YUY2TOYROW_AVX2 -#define HAS_YUY2TOUV422ROW_AVX2 -#define HAS_YUY2TOUVROW_AVX2 -#define HAS_UYVYTOYROW_AVX2 -#define HAS_UYVYTOUV422ROW_AVX2 -#define HAS_UYVYTOUVROW_AVX2 -#define HAS_SPLITUVROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 -#define HAS_ARGBMIRRORROW_AVX2 -#define HAS_ARGBTOYROW_AVX2 -#define HAS_ARGBTOYJROW_AVX2 +#define HAS_SPLITUVROW_AVX2 +#define HAS_UYVYTOUV422ROW_AVX2 +#define HAS_UYVYTOUVROW_AVX2 +#define HAS_UYVYTOYROW_AVX2 +#define HAS_YUY2TOUV422ROW_AVX2 +#define HAS_YUY2TOUVROW_AVX2 +#define HAS_YUY2TOYROW_AVX2 // Effects: #define HAS_ARGBADDROW_AVX2 -#define HAS_ARGBSUBTRACTROW_AVX2 -#define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2 +#define HAS_ARGBMULTIPLYROW_AVX2 +#define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #endif @@ -240,100 +240,101 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_SSSE3_ONLY) -#define HAS_ARGBBLENDROW_SSE2 #define HAS_ARGBATTENUATEROW_SSE2 +#define 
HAS_ARGBBLENDROW_SSE2 #define HAS_MIRRORROW_SSE2 #endif // The following are available on arm64 platforms: +// TODO(fbarchard): Merge arm64 and arm7 neon macros. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -#define HAS_I444TOARGBROW_NEON -#define HAS_I422TOARGBROW_NEON -#define HAS_I411TOARGBROW_NEON -#define HAS_I422TOBGRAROW_NEON -#define HAS_I422TOABGRROW_NEON -#define HAS_I422TORGBAROW_NEON -#define HAS_I422TORGB24ROW_NEON -#define HAS_I422TORAWROW_NEON -#define HAS_I422TORGB565ROW_NEON -#define HAS_I422TOARGB1555ROW_NEON -#define HAS_I422TOARGB4444ROW_NEON -#define HAS_YTOARGBROW_NEON -#define HAS_I400TOARGBROW_NEON -#define HAS_NV12TOARGBROW_NEON -#define HAS_NV21TOARGBROW_NEON -#define HAS_NV12TORGB565ROW_NEON -#define HAS_NV21TORGB565ROW_NEON -#define HAS_YUY2TOARGBROW_NEON -#define HAS_UYVYTOARGBROW_NEON -#define HAS_SPLITUVROW_NEON -#define HAS_MERGEUVROW_NEON -#define HAS_COPYROW_NEON -#define HAS_SETROW_NEON -#define HAS_ARGBSETROWS_NEON -#define HAS_MIRRORROW_NEON -#define HAS_MIRRORUVROW_NEON -#define HAS_ARGBMIRRORROW_NEON -#define HAS_RGB24TOARGBROW_NEON -#define HAS_RAWTOARGBROW_NEON -#define HAS_RGB565TOARGBROW_NEON +#define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYROW_NEON #define HAS_ARGB1555TOARGBROW_NEON +#define HAS_ARGB1555TOUVROW_NEON +#define HAS_ARGB1555TOYROW_NEON #define HAS_ARGB4444TOARGBROW_NEON -#define HAS_ARGBTORGB24ROW_NEON -#define HAS_ARGBTORAWROW_NEON -#define HAS_YUY2TOYROW_NEON -#define HAS_UYVYTOYROW_NEON -#define HAS_YUY2TOUV422ROW_NEON -#define HAS_UYVYTOUV422ROW_NEON -#define HAS_YUY2TOUVROW_NEON -#define HAS_UYVYTOUVROW_NEON -#define HAS_ARGBTOBAYERROW_NEON -#define HAS_ARGBTOBAYERGGROW_NEON +#define HAS_ARGB4444TOUVROW_NEON +#define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBADDROW_NEON +#define HAS_ARGBATTENUATEROW_NEON +#define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_ARGBGRAYROW_NEON +#define HAS_ARGBMIRRORROW_NEON +#define HAS_ARGBMULTIPLYROW_NEON +#define 
HAS_ARGBQUANTIZEROW_NEON +#define HAS_ARGBSEPIAROW_NEON +#define HAS_ARGBSETROWS_NEON +#define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSHUFFLEROW_NEON -#define HAS_I422TOYUY2ROW_NEON -#define HAS_I422TOUYVYROW_NEON -#define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBSUBTRACTROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON -#define HAS_ARGBTOYROW_NEON -#define HAS_ARGBTOYJROW_NEON -#define HAS_ARGBTOUV444ROW_NEON -#define HAS_ARGBTOUV422ROW_NEON +#define HAS_ARGBTOBAYERGGROW_NEON +#define HAS_ARGBTOBAYERROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGB565ROW_NEON #define HAS_ARGBTOUV411ROW_NEON -#define HAS_ARGBTOUVROW_NEON +#define HAS_ARGBTOUV422ROW_NEON +#define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON +#define HAS_ARGBTOUVROW_NEON +#define HAS_ARGBTOYJROW_NEON +#define HAS_ARGBTOYROW_NEON #define HAS_BGRATOUVROW_NEON -#define HAS_ABGRTOUVROW_NEON -#define HAS_RGBATOUVROW_NEON -#define HAS_RGB24TOUVROW_NEON -#define HAS_RAWTOUVROW_NEON -#define HAS_RGB565TOUVROW_NEON -#define HAS_ARGB1555TOUVROW_NEON -#define HAS_ARGB4444TOUVROW_NEON -#define HAS_RGB565TOYROW_NEON -#define HAS_ARGB1555TOYROW_NEON -#define HAS_ARGB4444TOYROW_NEON #define HAS_BGRATOYROW_NEON -#define HAS_ABGRTOYROW_NEON -#define HAS_RGBATOYROW_NEON -#define HAS_RGB24TOYROW_NEON -#define HAS_RAWTOYROW_NEON +#define HAS_COPYROW_NEON +#define HAS_I400TOARGBROW_NEON +#define HAS_I411TOARGBROW_NEON +#define HAS_I422TOABGRROW_NEON +#define HAS_I422TOARGB1555ROW_NEON +#define HAS_I422TOARGB4444ROW_NEON +#define HAS_I422TOARGBROW_NEON +#define HAS_I422TOBGRAROW_NEON +#define HAS_I422TORAWROW_NEON +#define HAS_I422TORGB24ROW_NEON +#define HAS_I422TORGB565ROW_NEON +#define HAS_I422TORGBAROW_NEON +#define HAS_I422TOUYVYROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I444TOARGBROW_NEON #define HAS_INTERPOLATEROW_NEON -#define HAS_ARGBBLENDROW_NEON -#define HAS_ARGBATTENUATEROW_NEON -#define HAS_ARGBQUANTIZEROW_NEON 
-#define HAS_ARGBSHADEROW_NEON -#define HAS_ARGBGRAYROW_NEON -#define HAS_ARGBSEPIAROW_NEON -#define HAS_ARGBCOLORMATRIXROW_NEON -#define HAS_ARGBMULTIPLYROW_NEON -#define HAS_ARGBADDROW_NEON -#define HAS_ARGBSUBTRACTROW_NEON +#define HAS_MERGEUVROW_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORUVROW_NEON +#define HAS_NV12TOARGBROW_NEON +#define HAS_NV12TORGB565ROW_NEON +#define HAS_NV21TOARGBROW_NEON +#define HAS_NV21TORGB565ROW_NEON +#define HAS_RAWTOARGBROW_NEON +#define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYROW_NEON +#define HAS_RGB565TOARGBROW_NEON +#define HAS_RGB565TOUVROW_NEON +#define HAS_RGB565TOYROW_NEON +#define HAS_RGBATOUVROW_NEON +#define HAS_RGBATOYROW_NEON +#define HAS_SETROW_NEON #define HAS_SOBELROW_NEON #define HAS_SOBELTOPLANEROW_NEON -#define HAS_SOBELXYROW_NEON #define HAS_SOBELXROW_NEON +#define HAS_SOBELXYROW_NEON #define HAS_SOBELYROW_NEON +#define HAS_SPLITUVROW_NEON +#define HAS_UYVYTOARGBROW_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_YTOARGBROW_NEON +#define HAS_YUY2TOARGBROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_YUY2TOYROW_NEON #endif // The following are available on Neon platforms: @@ -349,18 +350,18 @@ extern "C" { #define HAS_ARGB4444TOYROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON -#define HAS_ARGBTOBAYERROW_NEON #define HAS_ARGBTOBAYERGGROW_NEON +#define HAS_ARGBTOBAYERROW_NEON #define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565ROW_NEON #define HAS_ARGBTOUV411ROW_NEON #define HAS_ARGBTOUV422ROW_NEON #define HAS_ARGBTOUV444ROW_NEON -#define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOUVJROW_NEON -#define HAS_ARGBTOYROW_NEON +#define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON +#define HAS_ARGBTOYROW_NEON #define HAS_BGRATOUVROW_NEON #define 
HAS_BGRATOYROW_NEON #define HAS_COPYROW_NEON @@ -419,12 +420,12 @@ extern "C" { #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON +#define HAS_INTERPOLATEROW_NEON #define HAS_SOBELROW_NEON #define HAS_SOBELTOPLANEROW_NEON -#define HAS_SOBELXYROW_NEON #define HAS_SOBELXROW_NEON +#define HAS_SOBELXYROW_NEON #define HAS_SOBELYROW_NEON -#define HAS_INTERPOLATEROW_NEON // TODO(fbarchard): Investigate neon unittest failure. // #define HAS_ARGBCOLORMATRIXROW_NEON #endif diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d6209bf43..8ef03888e 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1184 +#define LIBYUV_VERSION 1185 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index ea33f8606..aba410dc9 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -92,6 +92,7 @@ static uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; +// 7 bit fixed point 0.5. static vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; @@ -704,6 +705,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { #endif // HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_ARGBTOYROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %3,%%xmm4 \n" @@ -743,6 +745,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYJROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %3,%%xmm4 \n" @@ -788,6 +792,7 @@ static const lvec32 kPermdARGBToY_AVX = { 0, 4, 1, 5, 2, 6, 3, 7 }; +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "vbroadcastf128 %3,%%ymm4 \n" @@ -804,13 +809,13 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" "vpsrlw $0x7,%%ymm0,%%ymm0 \n" "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y "vmovdqu %%ymm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x20,%2 \n" @@ -831,6 +836,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { #endif // HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 YJ values. void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "vbroadcastf128 %3,%%ymm4 \n" @@ -847,13 +853,14 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
+ "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" "vpsrlw $0x7,%%ymm0,%%ymm0 \n" "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. "vmovdqu %%ymm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x20,%2 \n" @@ -863,7 +870,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { "+r"(dst_y), // %1 "+r"(pix) // %2 : "m"(kARGBToYJ), // %3 - "m"(kAddY16), // %4 + "m"(kAddYJ64), // %4 "m"(kPermdARGBToY_AVX) // %5 : "memory", "cc" #if defined(__SSE2__) diff --git a/source/row_win.cc b/source/row_win.cc index 247d55d54..fae111cbf 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -210,6 +210,7 @@ static const uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; +// 7 bit fixed point 0.5. static const vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; @@ -697,8 +698,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 movdqa xmm4, kARGBToY + movdqa xmm5, kAddY16 convertloop: movdqu xmm0, [eax] @@ -724,7 +725,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } } -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. __declspec(naked) __declspec(align(16)) void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { @@ -787,7 +789,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { vpsrlw ymm2, ymm2, 7 vpackuswb ymm0, ymm0, ymm2 // mutates. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
- vpaddb ymm0, ymm0, ymm5 + vpaddb ymm0, ymm0, ymm5 // add 16 for Y vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32