From 18184fd19dba08d6567357e3913285a779e4b9f3 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 12 Mar 2012 18:53:19 +0000 Subject: [PATCH] switch looping to jg from ja to allow non-multiple of 16 to underflow to a negative BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/453001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@214 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/compare.cc | 10 +- source/convert.cc | 16 +-- source/convert_from.cc | 261 ++++++++++++++++++------------------ source/format_conversion.cc | 8 +- source/planar_functions.cc | 4 +- source/rotate.cc | 26 ++-- source/row_neon.cc | 10 +- source/row_posix.cc | 135 ++++++++++--------- source/row_win.cc | 136 ++++++++++--------- source/scale.cc | 134 +++++++++--------- 12 files changed, 374 insertions(+), 370 deletions(-) diff --git a/README.chromium b/README.chromium index c6991f0fe..efb9f69ea 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 213 +Version: 214 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 18f1f2f75..4d0444779 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 213 +#define LIBYUV_VERSION 214 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare.cc b/source/compare.cc index 7d188d082..c57a59162 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -58,7 +58,7 @@ static uint32 SumSquareError_NEON(const uint8* src_a, "vmlal.s16 q8, d5, d5 \n" "vmlal.s16 q10, d7, d7 \n" "subs %2, %2, #16 \n" - "bhi 1b \n" + "bgt 1b \n" "vadd.u32 q7, q7, q8 \n" "vadd.u32 q9, q9, q10 \n" @@ -94,6 +94,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, movdqa xmm1, [eax] movdqa xmm2, [eax + edx] lea eax, [eax + 16] + sub ecx, 16 movdqa xmm3, xmm1 psubusb xmm1, xmm2 psubusb xmm2, xmm3 @@ -105,8 +106,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, pmaddwd xmm2, xmm2 paddd xmm0, xmm1 paddd xmm0, xmm2 - sub ecx, 16 - ja wloop + jg wloop pshufd xmm1, xmm0, 0EEh paddd xmm0, xmm1 @@ -131,6 +131,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, "movdqa (%0),%%xmm1 \n" "movdqa (%0,%1,1),%%xmm2 \n" "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" "movdqa %%xmm1,%%xmm3 \n" "psubusb %%xmm2,%%xmm1 \n" "psubusb %%xmm3,%%xmm2 \n" @@ -142,8 +143,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, "pmaddwd %%xmm2,%%xmm2 \n" "paddd %%xmm1,%%xmm0 \n" "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" "pshufd $0xee,%%xmm0,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" diff --git a/source/convert.cc b/source/convert.cc index 02e0a06f7..af4d2693b 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -77,10 +77,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, convertloop: movdqa xmm0, [eax] pavgb xmm0, [eax + edx] + sub ecx, 16 movdqa [eax + edi], xmm0 lea eax, [eax + 16] - sub ecx, 16 - ja convertloop + jg convertloop pop edi ret } @@ -95,10 +95,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, "1: \n" "movdqa (%0),%%xmm0 \n" "pavgb (%0,%3),%%xmm0 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%0,%1) \n" "lea 0x10(%0),%0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(pix) // %2 @@ -495,10 +495,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, lea esi, [esi + 8] psrlw xmm1, 8 // V packuswb xmm1, xmm1 + sub ecx, 16 movq qword ptr [edi], xmm1 lea edi, [edi + 8] - sub ecx, 16 - ja convertloop + jg convertloop pop edi pop esi @@ -534,10 +534,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, "lea 0x8(%2),%2 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" + "sub $0x10,%4 \n" "movq %%xmm1,(%3) \n" "lea 0x8(%3),%3 \n" - "sub $0x10,%4 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(dst_u), // %2 diff --git a/source/convert_from.cc b/source/convert_from.cc index 0893eed71..7e41e2f8b 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -237,7 +237,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, movdqa [edi + 16], xmm1 lea edi, [edi + 32] sub ecx, 16 - ja convertloop + jg convertloop pop edi pop esi @@ -276,7 +276,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, movdqa [edi + 16], xmm2 lea edi, [edi + 32] sub ecx, 16 - ja convertloop + jg convertloop pop edi pop esi @@ -305,7 +305,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, "movdqa %%xmm1,0x10(%3) \n" "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -340,7 +340,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, "movdqa %%xmm2,0x10(%3) \n" "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1084,134 +1084,135 @@ int ConvertFromI420(const uint8* y, int y_stride, if (y == NULL || u == NULL || v == NULL || dst_sample == NULL) { return -1; } + int r = 0; switch (format) { // Single plane formats case FOURCC_YUY2: - I420ToYUY2(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToYUY2(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); break; case FOURCC_UYVY: - I420ToUYVY(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToUYVY(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); break; case FOURCC_V210: - I420ToV210(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : - (width + 47) / 48 * 128, - width, height); + r = I420ToV210(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : + (width + 47) / 48 * 128, + width, height); break; case FOURCC_RGBP: - I420ToRGB565(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToRGB565(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); break; case FOURCC_RGBO: - I420ToARGB1555(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToARGB1555(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); break; case FOURCC_R444: - I420ToARGB4444(y, y_stride, + r = I420ToARGB4444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, + dst_sample_stride ? dst_sample_stride : width * 4, width, height); break; - case FOURCC_24BG: - I420ToRGB24(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); - break; - case FOURCC_RAW: - I420ToRAW(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); - break; - case FOURCC_ARGB: - I420ToARGB(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); - break; case FOURCC_BGRA: - I420ToBGRA(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToBGRA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); break; case FOURCC_ABGR: - I420ToABGR(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToABGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); break; case FOURCC_BGGR: - I420ToBayerBGGR(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I420ToBayerBGGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); break; case FOURCC_GBRG: - I420ToBayerGBRG(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I420ToBayerGBRG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); break; case FOURCC_GRBG: - I420ToBayerGRBG(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I420ToBayerGRBG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); break; case FOURCC_RGGB: - I420ToBayerRGGB(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I420ToBayerRGGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); break; case FOURCC_I400: - I400Copy(y, y_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I400Copy(y, y_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); break; // Triplanar formats // TODO(fbarchard): halfstride instead of halfwidth @@ -1228,13 +1229,13 @@ int ConvertFromI420(const uint8* y, int y_stride, dst_v = dst_sample + width * height; dst_u = dst_v + halfwidth * halfheight; } - I420Copy(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, - width, height); + r = I420Copy(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); break; } case FOURCC_I422: @@ -1249,13 +1250,13 @@ int ConvertFromI420(const uint8* y, int y_stride, dst_v = dst_sample + width * height; dst_u = dst_v + halfwidth * height; } - I420ToI422(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, - width, height); + r = I420ToI422(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); break; } case FOURCC_I444: @@ -1269,26 +1270,26 @@ int ConvertFromI420(const uint8* y, int y_stride, dst_v = dst_sample + width * height; dst_u = dst_v + width * height; } - I420ToI444(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, width, - dst_v, width, - width, height); + r = I420ToI444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, width, + dst_v, width, + width, height); break; } case FOURCC_I411: { int quarterwidth = (width + 3) / 4; uint8* dst_u = dst_sample + width * height; uint8* dst_v = dst_u + quarterwidth * height; - I420ToI411(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, quarterwidth, - dst_v, quarterwidth, - width, height); + r = I420ToI411(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, quarterwidth, + dst_v, quarterwidth, + width, height); break; } @@ -1296,7 +1297,7 @@ int ConvertFromI420(const uint8* y, int y_stride, default: return -1; // unknown fourcc - return failure code. } - return 0; + return r; } #ifdef __cplusplus diff --git a/source/format_conversion.cc b/source/format_conversion.cc index 471ed52d4..5cedf2a8e 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -40,10 +40,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, movdqa xmm0, [eax] lea eax, [eax + 16] pshufb xmm0, xmm5 + sub ecx, 4 movd [edx], xmm0 lea edx, [edx + 4] - sub ecx, 4 - ja wloop + jg wloop ret } } @@ -60,10 +60,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, "movdqa (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" "pshufb %%xmm5,%%xmm0 \n" + "sub $0x4,%2 \n" "movd %%xmm0,(%1) \n" "lea 0x4(%1),%1 \n" - "sub $0x4,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 13bbbc5bc..1b6763d74 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -685,7 +685,7 @@ static void SetRow8_NEON(uint8* dst, uint32 v32, int count) { "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop "vst1.u32 {q0}, [%0]! \n" // store - "bhi 1b \n" + "bgt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 @@ -738,7 +738,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width, rep stosd add edi, edx sub ebx, 1 - ja convertloop + jg convertloop pop ebp pop edi diff --git a/source/rotate.cc b/source/rotate.cc index 670114800..d62c36a7c 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, sub ecx, 8 movq qword ptr [edx + esi], xmm7 lea edx, [edx + 2 * esi] - ja convertloop + jg convertloop pop ebp pop esi @@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - ja convertloop + jg convertloop mov esp, [esp + 16] pop ebp @@ -366,7 +366,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, "sub $0x8,%2 \n" "movq %%xmm7,(%1,%4) \n" "lea (%1,%4,2),%1 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -493,7 +493,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "lea (%edx,%esi,2),%edx \n" "movhpd %xmm0,(%ebx,%ebp,1) \n" "lea (%ebx,%ebp,2),%ebx \n" - "ja 1b \n" + "jg 1b \n" "mov 0x10(%esp),%esp \n" "pop %ebp \n" "pop %edi \n" @@ -629,7 +629,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, "sub $0x10,%2 \n" "movq %%xmm15,(%1,%4) \n" "lea (%1,%4,2),%1 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -737,7 +737,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "lea (%1,%5,2),%1 \n" "movhpd %%xmm8,(%2,%6) \n" "lea (%2,%6,2),%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 @@ -755,8 +755,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, static void TransposeWx8_C(const uint8* src, int src_stride, uint8* dst, int dst_stride, - int w) { - for (int i = 0; i < w; ++i) { + int width) { + for (int i = 0; i < width; ++i) { dst[0] = src[0 * src_stride]; dst[1] = src[1 * src_stride]; dst[2] = src[2 * src_stride]; @@ -888,9 +888,9 @@ void RotatePlane180(const uint8* src, int src_stride, static void TransposeUVWx8_C(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, - int w) { + int width) { int i; - for (i = 0; i < w; ++i) { + for (i = 0; i < width; ++i) { dst_a[0] = src[0 * src_stride + 0]; dst_b[0] = src[0 * src_stride + 1]; dst_a[1] = src[1 * src_stride + 0]; @@ -916,10 +916,10 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride, static void TransposeUVWxH_C(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, - int w, int h) { + int width, int height) { int i, j; - for (i = 0; i < w * 2; i += 2) - for (j = 0; j < h; ++j) { + for (i = 0; i < width * 2; i += 2) + for (j = 0; j < height; ++j) { dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; } diff --git a/source/row_neon.cc b/source/row_neon.cc index fb4205a79..3ebebc113 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -73,7 +73,7 @@ YUVTORGB "vmov.u8 d23, #255 \n" "vst4.u8 {d20, d21, d22, d23}, [%3]! \n" "subs %4, %4, #8 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 @@ -106,7 +106,7 @@ YUVTORGB "vmov.u8 d19, #255 \n" "vst4.u8 {d19, d20, d21, d22}, [%3]! \n" "subs %4, %4, #8 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 @@ -139,7 +139,7 @@ YUVTORGB "vmov.u8 d23, #255 \n" "vst4.u8 {d20, d21, d22, d23}, [%3]! \n" "subs %4, %4, #8 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 @@ -163,7 +163,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { "subs %3, %3, #16 \n" // 16 processed per loop "vst1.u8 {q0}, [%1]! \n" // store U "vst1.u8 {q1}, [%2]! \n" // Store V - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -183,7 +183,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { "vldm %0!,{q0,q1,q2,q3} \n" // load 64 "subs %2, %2, #64 \n" // 64 processed per loop "vstm %1!,{q0,q1,q2,q3} \n" // store 64 - "bhi 1b \n" + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(count) // %2 // Output registers diff --git a/source/row_posix.cc b/source/row_posix.cc index 3d781fdf0..06a06a52f 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -125,7 +125,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "movdqa %%xmm1,0x10(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -140,14 +140,15 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { asm volatile ( "movdqa %3,%%xmm5 \n" + "sub %0,%1 \n" "1: \n" "movdqa (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" "pshufb %%xmm5,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" - "ja 1b \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -162,14 +163,14 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { asm volatile ( "movdqa %3,%%xmm5 \n" + "sub %0,%1 \n" "1: \n" "movdqa (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" "pshufb %%xmm5,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" - "ja 1b \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -206,10 +207,10 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { "pshufb %%xmm4,%%xmm3 \n" "movdqa %%xmm1,0x10(%1) \n" "por %%xmm5,%%xmm3 \n" + "sub $0x10,%2 \n" "movdqa %%xmm3,0x30(%1) \n" "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -246,10 +247,10 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "pshufb %%xmm4,%%xmm3 \n" "movdqa %%xmm1,0x10(%1) \n" "por %%xmm5,%%xmm3 \n" + "sub $0x10,%2 \n" "movdqa %%xmm3,0x30(%1) \n" "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -298,7 +299,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,0x10(%1,%0,2) \n" "lea 0x10(%0),%0 \n" "sub $0x8,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 @@ -350,7 +351,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,0x10(%1,%0,2) \n" "lea 0x10(%0),%0 \n" "sub $0x8,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 @@ -389,7 +390,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm1,0x10(%1,%0,2) \n" "lea 0x10(%0),%0 \n" "sub $0x8,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 @@ -429,7 +430,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,0x20(%1) \n" "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 @@ -469,7 +470,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,0x20(%1) \n" "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 @@ -508,7 +509,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x4,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 @@ -551,7 +552,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x4,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 @@ -582,7 +583,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x4,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 @@ -614,10 +615,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -650,10 +651,10 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -718,11 +719,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,(%1,%2,1) \n" "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -786,11 +787,11 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,(%1,%2,1) \n" "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -823,10 +824,10 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -859,10 +860,10 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -922,11 +923,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,(%1,%2,1) \n" "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -990,11 +991,11 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,(%1,%2,1) \n" "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1027,10 +1028,10 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1063,10 +1064,10 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1126,11 +1127,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,(%1,%2,1) \n" "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1194,11 +1195,11 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,(%1,%2,1) \n" "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1305,7 +1306,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm1,0x10(%3) \n" "lea 0x20(%3),%3 \n" "sub $0x8,%4 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 @@ -1340,7 +1341,7 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,0x10(%3) \n" "lea 0x20(%3),%3 \n" "sub $0x8,%4 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 @@ -1374,7 +1375,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, "movdqa %%xmm1,0x10(%3) \n" "lea 0x20(%3),%3 \n" "sub $0x8,%4 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 @@ -1427,10 +1428,10 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm0 \n" + "sub $0x4,%4 \n" "movdqa %%xmm0,(%3) \n" "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 @@ -1479,7 +1480,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, "lea 32(%1),%1 \n" "sub $0x8,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(y_buf), // %0 "+r"(rgb_buf), // %1 "+rm"(width) // %2 @@ -1509,7 +1510,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { "sub $0x10,%2 \n" "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -1539,7 +1540,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { "sub $0x10,%2 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -1572,7 +1573,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, "movlpd %%xmm0,(%1) \n" "movhpd %%xmm0,(%1,%2) \n" "lea 8(%1),%1 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1608,7 +1609,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "movdqa %%xmm2,(%1,%2) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1633,7 +1634,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { "movdqa %%xmm1,0x10(%0,%1) \n" "lea 0x20(%0),%0 \n" "sub $0x20,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(count) // %2 @@ -1676,7 +1677,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1714,7 +1715,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "movq %%xmm1,(%1,%2) \n" "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_y), // %2 @@ -1739,10 +1740,10 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1782,7 +1783,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, "movq %%xmm1,(%1,%2) \n" "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_y), // %2 @@ -1804,10 +1805,10 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1845,7 +1846,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "movq %%xmm1,(%1,%2) \n" "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_y), // %2 @@ -1868,10 +1869,10 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1909,7 +1910,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "movq %%xmm1,(%1,%2) \n" "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_y), // %2 diff --git a/source/row_win.cc b/source/row_win.cc index 5bf422069..c538562ff 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -122,7 +122,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { movdqa [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 - ja convertloop + jg convertloop ret } } @@ -134,16 +134,16 @@ __asm { mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix movdqa xmm5, kShuffleMaskABGRToARGB + sub edx, eax align 16 convertloop: movdqa xmm0, [eax] - lea eax, [eax + 16] pshufb xmm0, xmm5 - movdqa [edx], xmm0 - lea edx, [edx + 16] sub ecx, 4 - ja convertloop + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop ret } } @@ -155,16 +155,16 @@ __asm { mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix movdqa xmm5, kShuffleMaskBGRAToARGB + sub edx, eax align 16 convertloop: movdqa xmm0, [eax] - lea eax, [eax + 16] pshufb xmm0, xmm5 - movdqa [edx], xmm0 - lea edx, [edx + 16] sub ecx, 4 - ja convertloop + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop ret } } @@ -200,10 +200,10 @@ __asm { pshufb xmm3, xmm4 movdqa [edx + 16], xmm1 por xmm3, xmm5 + sub ecx, 16 movdqa [edx + 48], xmm3 lea edx, [edx + 64] - sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -240,10 +240,10 @@ __asm { pshufb xmm3, xmm4 movdqa [edx + 16], xmm1 por xmm3, xmm5 + sub ecx, 16 movdqa [edx + 48], xmm3 lea edx, [edx + 64] - sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -300,7 +300,7 @@ __asm { movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 - ja convertloop + jg convertloop ret } } @@ -354,7 +354,7 @@ __asm { movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 - ja convertloop + jg convertloop ret } } @@ -394,7 +394,7 @@ __asm { movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 - ja convertloop + jg convertloop ret } } @@ -433,7 +433,7 @@ __asm { movdqa [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -472,7 +472,7 @@ __asm { movdqa [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -510,7 +510,7 @@ __asm { movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 lea edx, [edx + 8] sub ecx, 4 - ja convertloop + jg convertloop ret } } @@ -553,7 +553,7 @@ __asm { movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 lea edx, [edx + 8] sub ecx, 4 - ja convertloop + jg convertloop ret } } @@ -583,7 +583,7 @@ __asm { movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 lea edx, [edx + 8] sub ecx, 4 - ja convertloop + jg convertloop ret } } @@ -618,7 +618,7 @@ __asm { movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -652,7 +652,7 @@ __asm { movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -686,7 +686,7 @@ __asm { movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -720,7 +720,7 @@ __asm { movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -754,7 +754,7 @@ __asm { movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -785,10 +785,10 @@ __asm { psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 + sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] - sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -847,11 +847,12 @@ __asm { paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values + sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] - sub ecx, 16 - ja convertloop + jg convertloop + pop edi pop esi ret @@ -916,11 +917,12 @@ __asm { paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values + sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] - sub ecx, 16 - ja convertloop + jg convertloop + pop edi pop esi ret @@ -981,11 +983,12 @@ __asm { paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values + sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] - sub ecx, 16 - ja convertloop + jg convertloop + pop edi pop esi ret @@ -1050,11 +1053,12 @@ __asm { paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values + sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] - sub ecx, 16 - ja convertloop + jg convertloop + pop edi pop esi ret @@ -1115,11 +1119,12 @@ __asm { paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values + sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] - sub ecx, 16 - ja convertloop + jg convertloop + pop edi pop esi ret @@ -1184,11 +1189,12 @@ __asm { paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values + sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] - sub ecx, 16 - ja convertloop + jg convertloop + pop edi pop esi ret @@ -1293,9 +1299,8 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf, movdqa [edx], xmm0 movdqa [edx + 16], xmm1 lea edx, [edx + 32] - sub ecx, 8 - ja convertloop + jg convertloop pop edi pop esi @@ -1334,9 +1339,8 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf, movdqa [edx], xmm5 movdqa [edx + 16], xmm0 lea edx, [edx + 32] - sub ecx, 8 - ja convertloop + jg convertloop pop edi pop esi @@ -1375,9 +1379,8 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf, movdqa [edx], xmm2 movdqa [edx + 16], xmm1 lea edx, [edx + 32] - sub ecx, 8 - ja convertloop + jg convertloop pop edi pop esi @@ -1441,9 +1444,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, punpcklwd xmm0, xmm2 // BGRA 4 pixels movdqa [edx], xmm0 lea edx, [edx + 16] - sub ecx, 4 - ja convertloop + jg convertloop pop edi pop esi @@ -1490,9 +1492,8 @@ void YToARGBRow_SSE2(const uint8* y_buf, movdqa [edx], xmm0 movdqa [edx + 16], xmm1 lea edx, [edx + 32] - sub ecx, 8 - ja convertloop + jg convertloop ret } @@ -1523,7 +1524,7 @@ __asm { sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] - ja convertloop + jg convertloop ret } } @@ -1553,7 +1554,7 @@ __asm { sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] - ja convertloop + jg convertloop ret } } @@ -1587,7 +1588,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, movlpd qword ptr [edx], xmm0 movhpd qword ptr [edx + edi], xmm0 lea edx, [edx + 8] - ja convertloop + jg convertloop pop edi ret @@ -1625,7 +1626,8 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { movdqa [edx + edi], xmm2 lea edx, [edx + 16] sub ecx, 16 - ja convertloop + jg convertloop + pop edi ret } @@ -1650,7 +1652,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { movdqa [eax + edx + 16], xmm1 lea eax, [eax + 32] sub ecx, 32 - ja convertloop + jg convertloop ret } } @@ -1693,10 +1695,10 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, pand xmm0, xmm5 // even bytes are Y pand xmm1, xmm5 packuswb xmm0, xmm1 + sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] - sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -1737,7 +1739,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, movq qword ptr [edx + edi], xmm1 lea edx, [edx + 8] sub ecx, 16 - ja convertloop + jg convertloop pop edi pop esi @@ -1763,10 +1765,10 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, pand xmm0, xmm5 // even bytes are Y pand xmm1, xmm5 packuswb xmm0, xmm1 + sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] - sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -1807,7 +1809,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, movq qword ptr [edx + edi], xmm1 lea edx, [edx + 8] sub ecx, 16 - ja convertloop + jg convertloop pop edi pop esi @@ -1831,10 +1833,10 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, psrlw xmm0, 8 // odd bytes are Y psrlw xmm1, 8 packuswb xmm0, xmm1 + sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] - sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -1875,7 +1877,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, movq qword ptr [edx + edi], xmm1 lea edx, [edx + 8] sub ecx, 16 - ja convertloop + jg convertloop pop edi pop esi @@ -1899,10 +1901,10 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, psrlw xmm0, 8 // odd bytes are Y psrlw xmm1, 8 packuswb xmm0, xmm1 + sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] - sub ecx, 16 - ja convertloop + jg convertloop ret } } @@ -1943,7 +1945,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, movq qword ptr [edx + edi], xmm1 lea edx, [edx + 8] sub ecx, 16 - ja convertloop + jg convertloop pop edi pop esi diff --git a/source/scale.cc b/source/scale.cc index 0764ab751..f3d6d771c 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -64,7 +64,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 "vst1.u8 {q0}, [%1]! \n" // store even pixels "subs %2, %2, #16 \n" // 16 processed per loop - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -88,7 +88,7 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, "vrshrn.u16 d1, q1, #2 \n" "vst1.u8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" // 16 processed per loop - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -109,7 +109,7 @@ static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, "vst1.u32 {d0[1]}, [%1]! \n" "subs %2, #4 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -143,7 +143,7 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, "vst1.u32 {d0[0]}, [%1]! \n" "subs %2, #4 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -165,7 +165,7 @@ static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */, "vmov d2, d3 \n" // order needs to be d0, d1, d2 "vst3.u8 {d0, d1, d2}, [%1]! \n" "subs %2, #24 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -219,7 +219,7 @@ static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, "vst3.u8 {d0, d1, d2}, [%1]! \n" "subs %2, #24 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -258,7 +258,7 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, "vst3.u8 {d0, d1, d2}, [%1]! \n" "subs %2, #24 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -292,7 +292,7 @@ static void ScaleRowDown38_NEON(const uint8* src_ptr, int, "vst1.u8 {d4}, [%1]! \n" "vst1.u32 {d5[0]}, [%1]! \n" "subs %2, #12 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -397,7 +397,7 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, "vst1.u8 {d3}, [%1]! \n" "vst1.u32 {d4[0]}, [%1]! \n" "subs %2, #12 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -492,7 +492,7 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, "vst1.u8 {d3}, [%1]! \n" "vst1.u32 {d4[0]}, [%1]! \n" "subs %2, #12 \n" - "bhi 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -529,14 +529,14 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr, "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" "vst1.u8 {q0}, [%0]! \n" - "bhi 1b \n" + "bgt 1b \n" "b 4f \n" "2: \n" "vld1.u8 {q0}, [%1]! \n" "subs %3, #16 \n" "vst1.u8 {q0}, [%0]! \n" - "bhi 2b \n" + "bgt 2b \n" "b 4f \n" "3: \n" @@ -545,7 +545,7 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr, "subs %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" - "bhi 3b \n" + "bgt 3b \n" "4: \n" "vst1.u8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 @@ -697,7 +697,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] - ja wloop + jg wloop ret } @@ -739,7 +739,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] - ja wloop + jg wloop pop esi ret @@ -772,7 +772,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, sub ecx, 8 movq qword ptr [edx], xmm0 lea edx, [edx + 8] - ja wloop + jg wloop ret } @@ -831,7 +831,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, sub ecx, 8 movq qword ptr [edx], xmm0 lea edx, [edx + 8] - ja wloop + jg wloop pop edi pop esi @@ -866,7 +866,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, sub ecx, 4 movd dword ptr [edx], xmm0 lea edx, [edx + 4] - ja wloop + jg wloop ret } @@ -936,7 +936,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, sub ecx, 4 movd dword ptr [edx], xmm0 lea edx, [edx + 4] - ja wloop + jg wloop pop ebp pop edi @@ -979,7 +979,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, movq qword ptr [edx + 16], xmm2 lea edx, [edx + 24] sub ecx, 24 - ja wloop + jg wloop ret } @@ -1050,7 +1050,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, sub ecx, 24 movq qword ptr [edx + 16], xmm0 lea edx, [edx + 24] - ja wloop + jg wloop pop esi ret @@ -1111,7 +1111,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, sub ecx, 24 movq qword ptr [edx + 16], xmm0 lea edx, [edx+24] - ja wloop + jg wloop pop esi ret @@ -1147,7 +1147,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, movhlps xmm1, xmm0 movd [edx + 8], xmm1 lea edx, [edx + 12] - ja xloop + jg xloop ret } @@ -1212,7 +1212,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, pextrw ebx, xmm2, 2 mov [edx + 4], bx lea edx, [edx + 6] - ja xloop + jg xloop pop ebx pop esi @@ -1258,7 +1258,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, pextrw ebx, xmm0, 2 mov [edx + 4], bx lea edx, [edx + 6] - ja xloop + jg xloop pop ebx pop esi @@ -1310,14 +1310,14 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, paddusw xmm0, xmm2 // sum 16 words paddusw xmm1, xmm3 sub ebp, 1 - ja yloop + jg yloop ydone: movdqa [edi], xmm0 movdqa [edi + 16], xmm1 lea edi, [edi + 32] sub ecx, 16 - ja xloop + jg xloop pop ebp pop ebx @@ -1379,7 +1379,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] - ja xloop + jg xloop mov al, [esi + edi - 1] mov [esi + edi], al @@ -1393,7 +1393,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] - ja xloop1 + jg xloop1 mov al, [esi + edi - 1] mov [esi + edi], al @@ -1408,7 +1408,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] - ja xloop2 + jg xloop2 mov al, [esi + edi - 1] mov [esi + edi], al @@ -1460,7 +1460,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] - ja xloop + jg xloop mov al, [esi + edi - 1] mov [esi + edi], al @@ -1474,7 +1474,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] - ja xloop1 + jg xloop1 mov al, [esi + edi - 1] mov [esi + edi], al @@ -1489,7 +1489,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] - ja xloop2 + jg xloop2 mov al, [esi + edi - 1] mov [esi + edi], al @@ -1542,7 +1542,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, sub ecx, 24 movq qword ptr [edx+16], xmm0 lea edx, [edx+24] - ja wloop + jg wloop ret } } @@ -1568,7 +1568,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1602,7 +1602,7 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1628,7 +1628,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x8,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1677,7 +1677,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x8,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -1708,7 +1708,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, "movd %%xmm0,(%1) \n" "lea 0x4(%1),%1 \n" "sub $0x4,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1744,14 +1744,14 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, "paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm3,%%xmm1 \n" "sub $0x1,%2 \n" - "ja 2b \n" + "jg 2b \n" "3: \n" "movdqa %%xmm0,(%1) \n" "movdqa %%xmm1,0x10(%1) \n" "lea 0x10(%3),%0 \n" "lea 0x20(%1),%1 \n" "sub $0x10,%4 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(tmp_height), // %2 @@ -1823,7 +1823,7 @@ extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, "sub $0x4,%ecx \n" "movd %xmm0,(%edi) \n" "lea 0x4(%edi),%edi \n" - "ja 1b \n" + "jg 1b \n" "popa \n" "ret \n" ); @@ -1857,7 +1857,7 @@ extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, "movq %xmm2,0x10(%edi) \n" "lea 0x18(%edi),%edi \n" "sub $0x18,%ecx \n" - "ja 1b \n" + "jg 1b \n" "popa \n" "ret \n" ); @@ -1910,7 +1910,7 @@ extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, "sub $0x18,%ecx \n" "movq %xmm0,0x10(%edi) \n" "lea 0x18(%edi),%edi \n" - "ja 1b \n" + "jg 1b \n" "popa \n" "ret \n" @@ -1967,7 +1967,7 @@ extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, "sub $0x18,%ecx \n" "movq %xmm0,0x10(%edi) \n" "lea 0x18(%edi),%edi \n" - "ja 1b \n" + "jg 1b \n" "popa \n" "ret \n" ); @@ -1997,7 +1997,7 @@ extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, "sub $0xc,%ecx \n" "movd %xmm1,0x8(%edi) \n" "lea 0xc(%edi),%edi \n" - "ja 1b \n" + "jg 1b \n" "popa \n" "ret \n" ); @@ -2054,7 +2054,7 @@ extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, "mov %ax,0x4(%edi) \n" "lea 0x6(%edi),%edi \n" "sub $0x6,%ecx \n" - "ja 1b \n" + "jg 1b \n" "popa \n" "ret \n" ); @@ -2091,7 +2091,7 @@ extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, "mov %ax,0x4(%edi) \n" "lea 0x6(%edi),%edi \n" "sub $0x6,%ecx \n" - "ja 1b \n" + "jg 1b \n" "popa \n" "ret \n" ); @@ -2147,7 +2147,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, "sub $0x10,%ecx \n" "movdqa %xmm0,(%esi,%edi,1) \n" "lea 0x10(%esi),%esi \n" - "ja 1b \n" + "jg 1b \n" "mov -0x1(%esi,%edi,1),%al \n" "mov %al,(%esi,%edi,1) \n" @@ -2160,7 +2160,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, "sub $0x10,%ecx \n" "movdqa %xmm0,(%esi,%edi,1) \n" "lea 0x10(%esi),%esi \n" - "ja 2b \n" + "jg 2b \n" "mov -0x1(%esi,%edi,1),%al \n" "mov %al,(%esi,%edi,1) \n" @@ -2174,7 +2174,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, "sub $0x10,%ecx \n" "movdqa %xmm0,(%esi,%edi,1) \n" "lea 0x10(%esi),%esi \n" - "ja 3b \n" + "jg 3b \n" "mov -0x1(%esi,%edi,1),%al \n" "mov %al,(%esi,%edi,1) \n" @@ -2224,7 +2224,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "sub $0x10,%ecx \n" "movdqa %xmm0,(%esi,%edi,1) \n" "lea 0x10(%esi),%esi \n" - "ja 1b \n" + "jg 1b \n" "mov -0x1(%esi,%edi,1),%al \n" "mov %al,(%esi,%edi,1) \n" @@ -2237,7 +2237,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "sub $0x10,%ecx \n" "movdqa %xmm0,(%esi,%edi,1) \n" "lea 0x10(%esi),%esi \n" - "ja 2b \n" + "jg 2b \n" "mov -0x1(%esi,%edi,1),%al \n" "mov %al,(%esi,%edi,1) \n" @@ -2251,7 +2251,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "sub $0x10,%ecx \n" "movdqa %xmm0,(%esi,%edi,1) \n" "lea 0x10(%esi),%esi \n" - "ja 3b \n" + "jg 3b \n" "mov -0x1(%esi,%edi,1),%al \n" "mov %al,(%esi,%edi,1) \n" @@ -2310,7 +2310,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, "movd %%xmm0,(%1) \n" "lea 0x4(%1),%1 \n" "sub $0x4,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2340,7 +2340,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, "movq %%xmm2,0x10(%1) \n" "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2392,7 +2392,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, "movq %%xmm0,0x10(%1) \n" "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2452,7 +2452,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, "movq %%xmm0,0x10(%1) \n" "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2486,7 +2486,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, "movd %%xmm1,0x8(%1) \n" "lea 0xc(%1),%1 \n" "sub $0xc,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2541,7 +2541,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, "mov %%ax,0x4(%1) \n" "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2578,7 +2578,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, "mov %%ax,0x4(%1) \n" "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" - "ja 1b \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2604,7 +2604,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, "movdqa %%xmm0,(%0) \n" "lea 0x10(%0),%0 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" "mov -0x1(%0),%%al \n" "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 @@ -2624,7 +2624,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, "movdqa %%xmm0,(%0) \n" "lea 0x10(%0),%0 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" "mov -0x1(%0),%%al \n" "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 @@ -2668,7 +2668,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, "movdqa %%xmm0,(%0) \n" "lea 0x10(%0),%0 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" "mov -0x1(%0),%%al \n" "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 @@ -2695,7 +2695,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, "movdqa %%xmm0,(%0) \n" "lea 0x10(%0),%0 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" "mov -0x1(%0),%%al \n" "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 @@ -2715,7 +2715,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, "movdqa %%xmm0,(%0) \n" "lea 0x10(%0),%0 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" "mov -0x1(%0),%%al \n" "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 @@ -2750,7 +2750,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, "movdqa %%xmm0,(%0) \n" "lea 0x10(%0),%0 \n" "sub $0x10,%2 \n" - "ja 1b \n" + "jg 1b \n" "mov -0x1(%0),%%al \n" "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0