From 18184fd19dba08d6567357e3913285a779e4b9f3 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Mon, 12 Mar 2012 18:53:19 +0000
Subject: [PATCH] Switch loop branches from ja to jg (bhi to bgt on NEON) to
 allow counts that are not a multiple of 16 to underflow to a negative value.
 BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/453001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@214 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium             |   2 +-
 include/libyuv/version.h    |   2 +-
 source/compare.cc           |  10 +-
 source/convert.cc           |  16 +--
 source/convert_from.cc      | 261 ++++++++++++++++++------------------
 source/format_conversion.cc |   8 +-
 source/planar_functions.cc  |   4 +-
 source/rotate.cc            |  26 ++--
 source/row_neon.cc          |  10 +-
 source/row_posix.cc         | 135 ++++++++++---------
 source/row_win.cc           | 136 ++++++++++---------
 source/scale.cc             | 134 +++++++++---------
 12 files changed, 374 insertions(+), 370 deletions(-)

diff --git a/README.chromium b/README.chromium
index c6991f0fe..efb9f69ea 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 213
+Version: 214
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 18f1f2f75..4d0444779 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 213
+#define LIBYUV_VERSION 214
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
 
diff --git a/source/compare.cc b/source/compare.cc
index 7d188d082..c57a59162 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -58,7 +58,7 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
     "vmlal.s16  q8, d5, d5                     \n"
     "vmlal.s16  q10, d7, d7                    \n"
     "subs       %2, %2, #16                    \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
 
     "vadd.u32   q7, q7, q8                     \n"
     "vadd.u32   q9, q9, q10                    \n"
@@ -94,6 +94,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
     movdqa     xmm1, [eax]
     movdqa     xmm2, [eax + edx]
     lea        eax,  [eax + 16]
+    sub        ecx, 16
     movdqa     xmm3, xmm1
     psubusb    xmm1, xmm2
     psubusb    xmm2, xmm3
@@ -105,8 +106,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
     pmaddwd    xmm2, xmm2
     paddd      xmm0, xmm1
     paddd      xmm0, xmm2
-    sub        ecx, 16
-    ja         wloop
+    jg         wloop
 
     pshufd     xmm1, xmm0, 0EEh
     paddd      xmm0, xmm1
@@ -131,6 +131,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
     "movdqa    (%0),%%xmm1                     \n"
     "movdqa    (%0,%1,1),%%xmm2                \n"
     "lea       0x10(%0),%0                     \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm1,%%xmm3                   \n"
     "psubusb   %%xmm2,%%xmm1                   \n"
     "psubusb   %%xmm3,%%xmm2                   \n"
@@ -142,8 +143,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
     "pmaddwd   %%xmm2,%%xmm2                   \n"
     "paddd     %%xmm1,%%xmm0                   \n"
     "paddd     %%xmm2,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
 
     "pshufd    $0xee,%%xmm0,%%xmm1             \n"
     "paddd     %%xmm1,%%xmm0                   \n"
diff --git a/source/convert.cc b/source/convert.cc
index 02e0a06f7..af4d2693b 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -77,10 +77,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
   convertloop:
     movdqa     xmm0, [eax]
     pavgb      xmm0, [eax + edx]
+    sub        ecx, 16
     movdqa     [eax + edi], xmm0
     lea        eax,  [eax + 16]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     pop        edi
     ret
   }
@@ -95,10 +95,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
 "1:                                            \n"
   "movdqa     (%0),%%xmm0                      \n"
   "pavgb      (%0,%3),%%xmm0                   \n"
+  "sub        $0x10,%2                         \n"
   "movdqa     %%xmm0,(%0,%1)                   \n"
   "lea        0x10(%0),%0                      \n"
-  "sub        $0x10,%2                         \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_uv),  // %0
     "+r"(dst_uv),  // %1
     "+r"(pix)      // %2
@@ -495,10 +495,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
     lea        esi, [esi + 8]
     psrlw      xmm1, 8     // V
     packuswb   xmm1, xmm1
+    sub        ecx, 16
     movq       qword ptr [edi], xmm1
     lea        edi, [edi + 8]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -534,10 +534,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
   "lea        0x8(%2),%2                       \n"
   "psrlw      $0x8,%%xmm1                      \n"
   "packuswb   %%xmm1,%%xmm1                    \n"
+  "sub        $0x10,%4                         \n"
   "movq       %%xmm1,(%3)                      \n"
   "lea        0x8(%3),%3                       \n"
-  "sub        $0x10,%4                         \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_yuy2),    // %0
     "+r"(dst_y),       // %1
     "+r"(dst_u),       // %2
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 0893eed71..7e41e2f8b 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -237,7 +237,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
     movdqa     [edi + 16], xmm1
     lea        edi, [edi + 32]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -276,7 +276,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
     movdqa     [edi + 16], xmm2
     lea        edi, [edi + 32]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -305,7 +305,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
     "movdqa    %%xmm1,0x10(%3)                   \n"
     "lea       0x20(%3),%3                       \n"
     "sub       $0x10,%4                          \n"
-    "ja         1b                               \n"
+    "jg         1b                               \n"
     : "+r"(src_y),  // %0
       "+r"(src_u),  // %1
       "+r"(src_v),  // %2
@@ -340,7 +340,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
     "movdqa    %%xmm2,0x10(%3)                   \n"
     "lea       0x20(%3),%3                       \n"
     "sub       $0x10,%4                          \n"
-    "ja         1b                               \n"
+    "jg         1b                               \n"
     : "+r"(src_y),  // %0
       "+r"(src_u),  // %1
       "+r"(src_v),  // %2
@@ -1084,134 +1084,135 @@ int ConvertFromI420(const uint8* y, int y_stride,
   if (y == NULL || u == NULL || v == NULL || dst_sample == NULL) {
     return -1;
   }
+  int r = 0;
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
-      I420ToYUY2(y, y_stride,
-                 u, u_stride,
-                 v, v_stride,
-                 dst_sample,
-                 dst_sample_stride ? dst_sample_stride : width * 2,
-                 width, height);
+      r = I420ToYUY2(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
       break;
     case FOURCC_UYVY:
-      I420ToUYVY(y, y_stride,
-                 u, u_stride,
-                 v, v_stride,
-                 dst_sample,
-                 dst_sample_stride ? dst_sample_stride : width * 2,
-                 width, height);
+      r = I420ToUYVY(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
       break;
     case FOURCC_V210:
-      I420ToV210(y, y_stride,
-                 u, u_stride,
-                 v, v_stride,
-                 dst_sample,
-                 dst_sample_stride ? dst_sample_stride :
-                     (width + 47) / 48 * 128,
-                 width, height);
+      r = I420ToV210(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride :
+                         (width + 47) / 48 * 128,
+                     width, height);
       break;
     case FOURCC_RGBP:
-      I420ToRGB565(y, y_stride,
-                   u, u_stride,
-                   v, v_stride,
-                   dst_sample,
-                   dst_sample_stride ? dst_sample_stride : width * 2,
-                   width, height);
+      r = I420ToRGB565(y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2,
+                       width, height);
       break;
     case FOURCC_RGBO:
-      I420ToARGB1555(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
+      r = I420ToARGB1555(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
       break;
     case FOURCC_R444:
-      I420ToARGB4444(y, y_stride,
+      r = I420ToARGB4444(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
+      break;
+    case FOURCC_24BG:
+      r = I420ToRGB24(y, y_stride,
+                      u, u_stride,
+                      v, v_stride,
+                      dst_sample,
+                      dst_sample_stride ? dst_sample_stride : width * 3,
+                      width, height);
+      break;
+    case FOURCC_RAW:
+      r = I420ToRAW(y, y_stride,
+                    u, u_stride,
+                    v, v_stride,
+                    dst_sample,
+                    dst_sample_stride ? dst_sample_stride : width * 3,
+                    width, height);
+      break;
+    case FOURCC_ARGB:
+      r = I420ToARGB(y, y_stride,
                      u, u_stride,
                      v, v_stride,
                      dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
                      width, height);
       break;
-    case FOURCC_24BG:
-      I420ToRGB24(y, y_stride,
-                  u, u_stride,
-                  v, v_stride,
-                  dst_sample,
-                  dst_sample_stride ? dst_sample_stride : width * 3,
-                  width, height);
-      break;
-    case FOURCC_RAW:
-      I420ToRAW(y, y_stride,
-                u, u_stride,
-                v, v_stride,
-                dst_sample,
-                dst_sample_stride ? dst_sample_stride : width * 3,
-                width, height);
-      break;
-    case FOURCC_ARGB:
-      I420ToARGB(y, y_stride,
-                 u, u_stride,
-                 v, v_stride,
-                 dst_sample,
-                 dst_sample_stride ? dst_sample_stride : width * 4,
-                 width, height);
-      break;
     case FOURCC_BGRA:
-      I420ToBGRA(y, y_stride,
-                 u, u_stride,
-                 v, v_stride,
-                 dst_sample,
-                 dst_sample_stride ? dst_sample_stride : width * 4,
-                 width, height);
+      r = I420ToBGRA(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
       break;
     case FOURCC_ABGR:
-      I420ToABGR(y, y_stride,
-                 u, u_stride,
-                 v, v_stride,
-                 dst_sample,
-                 dst_sample_stride ? dst_sample_stride : width * 4,
-                 width, height);
+      r = I420ToABGR(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
       break;
     case FOURCC_BGGR:
-      I420ToBayerBGGR(y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_sample,
-                      dst_sample_stride ? dst_sample_stride : width,
-                      width, height);
+      r = I420ToBayerBGGR(y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          dst_sample,
+                          dst_sample_stride ? dst_sample_stride : width,
+                          width, height);
       break;
     case FOURCC_GBRG:
-      I420ToBayerGBRG(y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_sample,
-                      dst_sample_stride ? dst_sample_stride : width,
-                      width, height);
+      r = I420ToBayerGBRG(y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          dst_sample,
+                          dst_sample_stride ? dst_sample_stride : width,
+                          width, height);
       break;
     case FOURCC_GRBG:
-      I420ToBayerGRBG(y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_sample,
-                      dst_sample_stride ? dst_sample_stride : width,
-                      width, height);
+      r = I420ToBayerGRBG(y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          dst_sample,
+                          dst_sample_stride ? dst_sample_stride : width,
+                          width, height);
       break;
     case FOURCC_RGGB:
-      I420ToBayerRGGB(y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_sample,
-                      dst_sample_stride ? dst_sample_stride : width,
-                      width, height);
+      r = I420ToBayerRGGB(y, y_stride,
+                          u, u_stride,
+                          v, v_stride,
+                          dst_sample,
+                          dst_sample_stride ? dst_sample_stride : width,
+                          width, height);
       break;
     case FOURCC_I400:
-      I400Copy(y, y_stride,
-               dst_sample,
-               dst_sample_stride ? dst_sample_stride : width,
-               width, height);
+      r = I400Copy(y, y_stride,
+                   dst_sample,
+                   dst_sample_stride ? dst_sample_stride : width,
+                   width, height);
       break;
     // Triplanar formats
     // TODO(fbarchard): halfstride instead of halfwidth
@@ -1228,13 +1229,13 @@ int ConvertFromI420(const uint8* y, int y_stride,
         dst_v = dst_sample + width * height;
         dst_u = dst_v + halfwidth * halfheight;
       }
-      I420Copy(y, y_stride,
-               u, u_stride,
-               v, v_stride,
-               dst_sample, width,
-               dst_u, halfwidth,
-               dst_v, halfwidth,
-               width, height);
+      r = I420Copy(y, y_stride,
+                   u, u_stride,
+                   v, v_stride,
+                   dst_sample, width,
+                   dst_u, halfwidth,
+                   dst_v, halfwidth,
+                   width, height);
       break;
     }
     case FOURCC_I422:
@@ -1249,13 +1250,13 @@ int ConvertFromI420(const uint8* y, int y_stride,
         dst_v = dst_sample + width * height;
         dst_u = dst_v + halfwidth * height;
       }
-      I420ToI422(y, y_stride,
-                 u, u_stride,
-                 v, v_stride,
-                 dst_sample, width,
-                 dst_u, halfwidth,
-                 dst_v, halfwidth,
-                 width, height);
+      r = I420ToI422(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, halfwidth,
+                     dst_v, halfwidth,
+                     width, height);
       break;
     }
     case FOURCC_I444:
@@ -1269,26 +1270,26 @@ int ConvertFromI420(const uint8* y, int y_stride,
         dst_v = dst_sample + width * height;
         dst_u = dst_v + width * height;
       }
-      I420ToI444(y, y_stride,
-                 u, u_stride,
-                 v, v_stride,
-                 dst_sample, width,
-                 dst_u, width,
-                 dst_v, width,
-                 width, height);
+      r = I420ToI444(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, width,
+                     dst_v, width,
+                     width, height);
       break;
     }
     case FOURCC_I411: {
       int quarterwidth = (width + 3) / 4;
       uint8* dst_u = dst_sample + width * height;
       uint8* dst_v = dst_u + quarterwidth * height;
-      I420ToI411(y, y_stride,
-                 u, u_stride,
-                 v, v_stride,
-                 dst_sample, width,
-                 dst_u, quarterwidth,
-                 dst_v, quarterwidth,
-                 width, height);
+      r = I420ToI411(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, quarterwidth,
+                     dst_v, quarterwidth,
+                     width, height);
       break;
     }
 
@@ -1296,7 +1297,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
     default:
       return -1;  // unknown fourcc - return failure code.
   }
-  return 0;
+  return r;
 }
 
 #ifdef __cplusplus
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index 471ed52d4..5cedf2a8e 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -40,10 +40,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
     movdqa     xmm0, [eax]
     lea        eax, [eax + 16]
     pshufb     xmm0, xmm5
+    sub        ecx, 4
     movd       [edx], xmm0
     lea        edx, [edx + 4]
-    sub        ecx, 4
-    ja         wloop
+    jg         wloop
     ret
   }
 }
@@ -60,10 +60,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
     "movdqa (%0),%%xmm0                        \n"
     "lea    0x10(%0),%0                        \n"
     "pshufb %%xmm5,%%xmm0                      \n"
+    "sub    $0x4,%2                            \n"
     "movd   %%xmm0,(%1)                        \n"
     "lea    0x4(%1),%1                         \n"
-    "sub    $0x4,%2                            \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_bayer), // %1
     "+r"(pix)        // %2
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 13bbbc5bc..1b6763d74 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -685,7 +685,7 @@ static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
     "1:                                        \n"
     "subs      %1, %1, #16                     \n"  // 16 bytes per loop
     "vst1.u32  {q0}, [%0]!                     \n"  // store
-    "bhi       1b                              \n"
+    "bgt       1b                              \n"
   : "+r"(dst),  // %0
     "+r"(count) // %1
   : "r"(v32)    // %2
@@ -738,7 +738,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
     rep stosd
     add        edi, edx
     sub        ebx, 1
-    ja         convertloop
+    jg         convertloop
 
     pop        ebp
     pop        edi
diff --git a/source/rotate.cc b/source/rotate.cc
index 670114800..d62c36a7c 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
     sub       ecx, 8
     movq      qword ptr [edx + esi], xmm7
     lea       edx, [edx + 2 * esi]
-    ja        convertloop
+    jg        convertloop
 
     pop       ebp
     pop       esi
@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    ja        convertloop
+    jg        convertloop
 
     mov       esp, [esp + 16]
     pop       ebp
@@ -366,7 +366,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
     "sub        $0x8,%2                          \n"
     "movq       %%xmm7,(%1,%4)                   \n"
     "lea        (%1,%4,2),%1                     \n"
-    "ja         1b                               \n"
+    "jg         1b                               \n"
     : "+r"(src),    // %0
       "+r"(dst),    // %1
       "+r"(width)   // %2
@@ -493,7 +493,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     "lea    (%edx,%esi,2),%edx                 \n"
     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
     "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
     "mov    0x10(%esp),%esp                    \n"
     "pop    %ebp                               \n"
     "pop    %edi                               \n"
@@ -629,7 +629,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
   "sub        $0x10,%2                         \n"
   "movq       %%xmm15,(%1,%4)                  \n"
   "lea        (%1,%4,2),%1                     \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src),    // %0
     "+r"(dst),    // %1
     "+r"(width)   // %2
@@ -737,7 +737,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   "lea        (%1,%5,2),%1                     \n"
   "movhpd     %%xmm8,(%2,%6)                   \n"
   "lea        (%2,%6,2),%2                     \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src),    // %0
     "+r"(dst_a),  // %1
     "+r"(dst_b),  // %2
@@ -755,8 +755,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
 
 static void TransposeWx8_C(const uint8* src, int src_stride,
                            uint8* dst, int dst_stride,
-                           int w) {
-  for (int i = 0; i < w; ++i) {
+                           int width) {
+  for (int i = 0; i < width; ++i) {
     dst[0] = src[0 * src_stride];
     dst[1] = src[1 * src_stride];
     dst[2] = src[2 * src_stride];
@@ -888,9 +888,9 @@ void RotatePlane180(const uint8* src, int src_stride,
 static void TransposeUVWx8_C(const uint8* src, int src_stride,
                              uint8* dst_a, int dst_stride_a,
                              uint8* dst_b, int dst_stride_b,
-                             int w) {
+                             int width) {
   int i;
-  for (i = 0; i < w; ++i) {
+  for (i = 0; i < width; ++i) {
     dst_a[0] = src[0 * src_stride + 0];
     dst_b[0] = src[0 * src_stride + 1];
     dst_a[1] = src[1 * src_stride + 0];
@@ -916,10 +916,10 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
 static void TransposeUVWxH_C(const uint8* src, int src_stride,
                              uint8* dst_a, int dst_stride_a,
                              uint8* dst_b, int dst_stride_b,
-                             int w, int h) {
+                             int width, int height) {
   int i, j;
-  for (i = 0; i < w * 2; i += 2)
-    for (j = 0; j < h; ++j) {
+  for (i = 0; i < width * 2; i += 2)
+    for (j = 0; j < height; ++j) {
       dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
       dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
     }
diff --git a/source/row_neon.cc b/source/row_neon.cc
index fb4205a79..3ebebc113 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -73,7 +73,7 @@ YUVTORGB
     "vmov.u8    d23, #255                      \n"
     "vst4.u8    {d20, d21, d22, d23}, [%3]!    \n"
     "subs       %4, %4, #8                     \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
     : "+r"(y_buf),    // %0
       "+r"(u_buf),    // %1
       "+r"(v_buf),    // %2
@@ -106,7 +106,7 @@ YUVTORGB
     "vmov.u8    d19, #255                      \n"
     "vst4.u8    {d19, d20, d21, d22}, [%3]!    \n"
     "subs       %4, %4, #8                     \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
     : "+r"(y_buf),    // %0
       "+r"(u_buf),    // %1
       "+r"(v_buf),    // %2
@@ -139,7 +139,7 @@ YUVTORGB
     "vmov.u8    d23, #255                      \n"
     "vst4.u8    {d20, d21, d22, d23}, [%3]!    \n"
     "subs       %4, %4, #8                     \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
     : "+r"(y_buf),    // %0
       "+r"(u_buf),    // %1
       "+r"(v_buf),    // %2
@@ -163,7 +163,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     "vst1.u8    {q0}, [%1]!                    \n"  // store U
     "vst1.u8    {q1}, [%2]!                    \n"  // Store V
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
     : "+r"(src_uv),  // %0
       "+r"(dst_u),   // %1
       "+r"(dst_v),   // %2
@@ -183,7 +183,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
     "vldm       %0!,{q0,q1,q2,q3}              \n"  // load 64
     "subs       %2, %2, #64                    \n"  // 64 processed per loop
     "vstm       %1!,{q0,q1,q2,q3}              \n"  // store 64
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
     : "+r"(src),   // %0
       "+r"(dst),   // %1
       "+r"(count)  // %2  // Output registers
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 3d781fdf0..06a06a52f 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -125,7 +125,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
     "movdqa    %%xmm1,0x10(%1)                 \n"
     "lea       0x20(%1),%1                     \n"
     "sub       $0x8,%2                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_y),     // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
@@ -140,14 +140,15 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
   asm volatile (
     "movdqa    %3,%%xmm5                       \n"
+    "sub       %0,%1                           \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
-    "lea       0x10(%0),%0                     \n"
     "pshufb    %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
     "sub       $0x4,%2                         \n"
-    "ja        1b                              \n"
+    "movdqa    %%xmm0,(%0,%1,1)                \n"
+    "lea       0x10(%0),%0                     \n"
+    "jg        1b                              \n"
+
   : "+r"(src_abgr),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
@@ -162,14 +163,14 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
 void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
   asm volatile (
     "movdqa    %3,%%xmm5                       \n"
+    "sub       %0,%1                           \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
-    "lea       0x10(%0),%0                     \n"
     "pshufb    %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm0,(%1)                     \n"
-    "lea       0x10(%1),%1                     \n"
     "sub       $0x4,%2                         \n"
-    "ja        1b                              \n"
+    "movdqa    %%xmm0,(%0,%1,1)                \n"
+    "lea       0x10(%0),%0                     \n"
+    "jg        1b                              \n"
   : "+r"(src_bgra),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
@@ -206,10 +207,10 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
     "pshufb    %%xmm4,%%xmm3                   \n"
     "movdqa    %%xmm1,0x10(%1)                 \n"
     "por       %%xmm5,%%xmm3                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm3,0x30(%1)                 \n"
     "lea       0x40(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_rgb24),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
@@ -246,10 +247,10 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
     "pshufb    %%xmm4,%%xmm3                   \n"
     "movdqa    %%xmm1,0x10(%1)                 \n"
     "por       %%xmm5,%%xmm3                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm3,0x30(%1)                 \n"
     "lea       0x40(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_raw),   // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
@@ -298,7 +299,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
     "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
     "lea       0x10(%0),%0                     \n"
     "sub       $0x8,%2                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
@@ -350,7 +351,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
     "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
     "lea       0x10(%0),%0                     \n"
     "sub       $0x8,%2                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
@@ -389,7 +390,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
     "movdqa    %%xmm1,0x10(%1,%0,2)            \n"
     "lea       0x10(%0),%0                     \n"
     "sub       $0x8,%2                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
@@ -429,7 +430,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
     "movdqa    %%xmm2,0x20(%1)                 \n"
     "lea       0x30(%1),%1                     \n"
     "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
@@ -469,7 +470,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
     "movdqa    %%xmm2,0x20(%1)                 \n"
     "lea       0x30(%1),%1                     \n"
     "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
@@ -508,7 +509,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
     "movq      %%xmm0,(%1)                     \n"
     "lea       0x8(%1),%1                      \n"
     "sub       $0x4,%2                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
@@ -551,7 +552,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
     "movq      %%xmm0,(%1)                     \n"
     "lea       0x8(%1),%1                      \n"
     "sub       $0x4,%2                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
@@ -582,7 +583,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
     "movq      %%xmm0,(%1)                     \n"
     "lea       0x8(%1),%1                      \n"
     "sub       $0x4,%2                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
@@ -614,10 +615,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -650,10 +651,10 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -718,11 +719,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0,(%1)                     \n"
     "movhps    %%xmm0,(%1,%2,1)                \n"
     "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_argb0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
@@ -786,11 +787,11 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0,(%1)                     \n"
     "movhps    %%xmm0,(%1,%2,1)                \n"
     "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_argb0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
@@ -823,10 +824,10 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_bgra),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -859,10 +860,10 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_bgra),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -922,11 +923,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0,(%1)                     \n"
     "movhps    %%xmm0,(%1,%2,1)                \n"
     "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_bgra0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
@@ -990,11 +991,11 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0,(%1)                     \n"
     "movhps    %%xmm0,(%1,%2,1)                \n"
     "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_bgra0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
@@ -1027,10 +1028,10 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_abgr),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -1063,10 +1064,10 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_abgr),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -1126,11 +1127,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0,(%1)                     \n"
     "movhps    %%xmm0,(%1,%2,1)                \n"
     "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_abgr0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
@@ -1194,11 +1195,11 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0,(%1)                     \n"
     "movhps    %%xmm0,(%1,%2,1)                \n"
     "lea       0x8(%1),%1                      \n"
-    "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_abgr0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
@@ -1305,7 +1306,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
     "movdqa    %%xmm1,0x10(%3)                 \n"
     "lea       0x20(%3),%3                     \n"
     "sub       $0x8,%4                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
@@ -1340,7 +1341,7 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
     "movdqa    %%xmm0,0x10(%3)                 \n"
     "lea       0x20(%3),%3                     \n"
     "sub       $0x8,%4                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
@@ -1374,7 +1375,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
     "movdqa    %%xmm1,0x10(%3)                 \n"
     "lea       0x20(%3),%3                     \n"
     "sub       $0x8,%4                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
@@ -1427,10 +1428,10 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
     "punpcklbw %%xmm1,%%xmm0                   \n"
     "punpcklbw %%xmm5,%%xmm2                   \n"
     "punpcklwd %%xmm2,%%xmm0                   \n"
+    "sub       $0x4,%4                         \n"
     "movdqa    %%xmm0,(%3)                     \n"
     "lea       0x10(%3),%3                     \n"
-    "sub       $0x4,%4                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
@@ -1479,7 +1480,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
     "lea       32(%1),%1                       \n"
 
     "sub       $0x8,%2                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(y_buf),    // %0
     "+r"(rgb_buf),  // %1
     "+rm"(width)    // %2
@@ -1509,7 +1510,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
     "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(temp_width)  // %2
@@ -1539,7 +1540,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
     "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(temp_width)  // %2
@@ -1572,7 +1573,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
     "movlpd    %%xmm0,(%1)                     \n"
     "movhpd    %%xmm0,(%1,%2)                  \n"
     "lea       8(%1),%1                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),      // %0
     "+r"(dst_u),    // %1
     "+r"(dst_v),    // %2
@@ -1608,7 +1609,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
     "movdqa     %%xmm2,(%1,%2)                   \n"
     "lea        0x10(%1),%1                      \n"
     "sub        $0x10,%3                         \n"
-    "ja         1b                               \n"
+    "jg         1b                               \n"
   : "+r"(src_uv),     // %0
     "+r"(dst_u),      // %1
     "+r"(dst_v),      // %2
@@ -1633,7 +1634,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
     "movdqa    %%xmm1,0x10(%0,%1)              \n"
     "lea       0x20(%0),%0                     \n"
     "sub       $0x20,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(count)  // %2
@@ -1676,7 +1677,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
     "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -1714,7 +1715,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
     "movq      %%xmm1,(%1,%2)                  \n"
     "lea       0x8(%1),%1                      \n"
     "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_yuy2),    // %0
     "+r"(dst_u),       // %1
     "+r"(dst_y),       // %2
@@ -1739,10 +1740,10 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
     "pand      %%xmm5,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -1782,7 +1783,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
     "movq      %%xmm1,(%1,%2)                  \n"
     "lea       0x8(%1),%1                      \n"
     "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_yuy2),    // %0
     "+r"(dst_u),       // %1
     "+r"(dst_y),       // %2
@@ -1804,10 +1805,10 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
     "psrlw     $0x8,%%xmm0                     \n"
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -1845,7 +1846,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
     "movq      %%xmm1,(%1,%2)                  \n"
     "lea       0x8(%1),%1                      \n"
     "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_uyvy),    // %0
     "+r"(dst_u),       // %1
     "+r"(dst_y),       // %2
@@ -1868,10 +1869,10 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
     "psrlw     $0x8,%%xmm0                     \n"
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
-    "sub       $0x10,%2                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
@@ -1909,7 +1910,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
     "movq      %%xmm1,(%1,%2)                  \n"
     "lea       0x8(%1),%1                      \n"
     "sub       $0x10,%3                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_uyvy),    // %0
     "+r"(dst_u),       // %1
     "+r"(dst_y),       // %2
diff --git a/source/row_win.cc b/source/row_win.cc
index 5bf422069..c538562ff 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -122,7 +122,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
     movdqa     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 8
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -134,16 +134,16 @@ __asm {
     mov       edx, [esp + 8]   // dst_argb
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm5, kShuffleMaskABGRToARGB
+    sub       edx, eax
 
     align      16
  convertloop:
     movdqa    xmm0, [eax]
-    lea       eax, [eax + 16]
     pshufb    xmm0, xmm5
-    movdqa    [edx], xmm0
-    lea       edx, [edx + 16]
     sub       ecx, 4
-    ja        convertloop
+    movdqa    [eax + edx], xmm0
+    lea       eax, [eax + 16]
+    jg        convertloop
     ret
   }
 }
@@ -155,16 +155,16 @@ __asm {
     mov       edx, [esp + 8]   // dst_argb
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm5, kShuffleMaskBGRAToARGB
+    sub       edx, eax
 
     align      16
  convertloop:
     movdqa    xmm0, [eax]
-    lea       eax, [eax + 16]
     pshufb    xmm0, xmm5
-    movdqa    [edx], xmm0
-    lea       edx, [edx + 16]
     sub       ecx, 4
-    ja        convertloop
+    movdqa    [eax + edx], xmm0
+    lea       eax, [eax + 16]
+    jg        convertloop
     ret
   }
 }
@@ -200,10 +200,10 @@ __asm {
     pshufb    xmm3, xmm4
     movdqa    [edx + 16], xmm1
     por       xmm3, xmm5
+    sub       ecx, 16
     movdqa    [edx + 48], xmm3
     lea       edx, [edx + 64]
-    sub       ecx, 16
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -240,10 +240,10 @@ __asm {
     pshufb    xmm3, xmm4
     movdqa    [edx + 16], xmm1
     por       xmm3, xmm5
+    sub       ecx, 16
     movdqa    [edx + 48], xmm3
     lea       edx, [edx + 64]
-    sub       ecx, 16
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -300,7 +300,7 @@ __asm {
     movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -354,7 +354,7 @@ __asm {
     movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -394,7 +394,7 @@ __asm {
     movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -433,7 +433,7 @@ __asm {
     movdqa    [edx + 32], xmm2   // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -472,7 +472,7 @@ __asm {
     movdqa    [edx + 32], xmm2   // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -510,7 +510,7 @@ __asm {
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
     lea       edx, [edx + 8]
     sub       ecx, 4
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -553,7 +553,7 @@ __asm {
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
     lea       edx, [edx + 8]
     sub       ecx, 4
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -583,7 +583,7 @@ __asm {
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
     lea       edx, [edx + 8]
     sub       ecx, 4
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -618,7 +618,7 @@ __asm {
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -652,7 +652,7 @@ __asm {
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -686,7 +686,7 @@ __asm {
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -720,7 +720,7 @@ __asm {
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -754,7 +754,7 @@ __asm {
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -785,10 +785,10 @@ __asm {
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
     paddb      xmm0, xmm5
+    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -847,11 +847,12 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
+
     pop        edi
     pop        esi
     ret
@@ -916,11 +917,12 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
+
     pop        edi
     pop        esi
     ret
@@ -981,11 +983,12 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
+
     pop        edi
     pop        esi
     ret
@@ -1050,11 +1053,12 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
+
     pop        edi
     pop        esi
     ret
@@ -1115,11 +1119,12 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
+
     pop        edi
     pop        esi
     ret
@@ -1184,11 +1189,12 @@ __asm {
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
+
     pop        edi
     pop        esi
     ret
@@ -1293,9 +1299,8 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
     movdqa     [edx], xmm0
     movdqa     [edx + 16], xmm1
     lea        edx,  [edx + 32]
-
     sub        ecx, 8
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -1334,9 +1339,8 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
     movdqa     [edx], xmm5
     movdqa     [edx + 16], xmm0
     lea        edx,  [edx + 32]
-
     sub        ecx, 8
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -1375,9 +1379,8 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
     movdqa     [edx], xmm2
     movdqa     [edx + 16], xmm1
     lea        edx,  [edx + 32]
-
     sub        ecx, 8
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -1441,9 +1444,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
     punpcklwd  xmm0, xmm2           // BGRA 4 pixels
     movdqa     [edx], xmm0
     lea        edx,  [edx + 16]
-
     sub        ecx, 4
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -1490,9 +1492,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
     movdqa     [edx], xmm0
     movdqa     [edx + 16], xmm1
     lea        edx,  [edx + 32]
-
     sub        ecx, 8
-    ja         convertloop
+    jg         convertloop
 
     ret
   }
@@ -1523,7 +1524,7 @@ __asm {
     sub       ecx, 16
     movdqa    [edx], xmm0
     lea       edx, [edx + 16]
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -1553,7 +1554,7 @@ __asm {
     sub       ecx, 16
     movdqu    [edx], xmm0
     lea       edx, [edx + 16]
-    ja        convertloop
+    jg        convertloop
     ret
   }
 }
@@ -1587,7 +1588,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
     movlpd    qword ptr [edx], xmm0
     movhpd    qword ptr [edx + edi], xmm0
     lea       edx, [edx + 8]
-    ja        convertloop
+    jg        convertloop
 
     pop       edi
     ret
@@ -1625,7 +1626,8 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
     movdqa     [edx + edi], xmm2
     lea        edx, [edx + 16]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
+
     pop        edi
     ret
   }
@@ -1650,7 +1652,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
     movdqa     [eax + edx + 16], xmm1
     lea        eax, [eax + 32]
     sub        ecx, 32
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -1693,10 +1695,10 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
     pand       xmm0, xmm5   // even bytes are Y
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
+    sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -1737,7 +1739,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
     movq       qword ptr [edx + edi], xmm1
     lea        edx, [edx + 8]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -1763,10 +1765,10 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
     pand       xmm0, xmm5   // even bytes are Y
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
+    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -1807,7 +1809,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
     movq       qword ptr [edx + edi], xmm1
     lea        edx, [edx + 8]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -1831,10 +1833,10 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
     psrlw      xmm0, 8    // odd bytes are Y
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
+    sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -1875,7 +1877,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
     movq       qword ptr [edx + edi], xmm1
     lea        edx, [edx + 8]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
@@ -1899,10 +1901,10 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
     psrlw      xmm0, 8    // odd bytes are Y
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
+    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
-    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
     ret
   }
 }
@@ -1943,7 +1945,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
     movq       qword ptr [edx + edi], xmm1
     lea        edx, [edx + 8]
     sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
 
     pop        edi
     pop        esi
diff --git a/source/scale.cc b/source/scale.cc
index 0764ab751..f3d6d771c 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -64,7 +64,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
     "vld2.u8    {q0,q1}, [%0]!                 \n"  // load even pixels into q0, odd into q1
     "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
     "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
     : "+r"(src_ptr),          // %0
       "+r"(dst),              // %1
       "+r"(dst_width)         // %2
@@ -88,7 +88,7 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
     "vrshrn.u16 d1, q1, #2                     \n"
     "vst1.u8    {q0}, [%2]!                    \n"
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
     : "+r"(src_ptr),          // %0
       "+r"(src_stride),       // %1
       "+r"(dst),              // %2
@@ -109,7 +109,7 @@ static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
     "vst1.u32   {d0[1]}, [%1]!                 \n"
 
     "subs       %2, #4                         \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
     : "+r"(src_ptr),          // %0
       "+r"(dst_ptr),          // %1
       "+r"(dst_width)         // %2
@@ -143,7 +143,7 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
     "vst1.u32   {d0[0]}, [%1]!                 \n"
 
     "subs       %2, #4                         \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
 
     : "+r"(src_ptr),          // %0
       "+r"(dst_ptr),          // %1
@@ -165,7 +165,7 @@ static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
     "vmov         d2, d3                       \n" // order needs to be d0, d1, d2
     "vst3.u8      {d0, d1, d2}, [%1]!          \n"
     "subs         %2, #24                      \n"
-    "bhi          1b                           \n"
+    "bgt          1b                           \n"
     : "+r"(src_ptr),          // %0
       "+r"(dst_ptr),          // %1
       "+r"(dst_width)         // %2
@@ -219,7 +219,7 @@ static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
     "vst3.u8      {d0, d1, d2}, [%1]!          \n"
 
     "subs         %2, #24                      \n"
-    "bhi          1b                           \n"
+    "bgt          1b                           \n"
     : "+r"(src_ptr),          // %0
       "+r"(dst_ptr),          // %1
       "+r"(dst_width),        // %2
@@ -258,7 +258,7 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
     "vst3.u8      {d0, d1, d2}, [%1]!          \n"
 
     "subs         %2, #24                      \n"
-    "bhi          1b                           \n"
+    "bgt          1b                           \n"
     : "+r"(src_ptr),          // %0
       "+r"(dst_ptr),          // %1
       "+r"(dst_width),        // %2
@@ -292,7 +292,7 @@ static void ScaleRowDown38_NEON(const uint8* src_ptr, int,
     "vst1.u8      {d4}, [%1]!                  \n"
     "vst1.u32     {d5[0]}, [%1]!               \n"
     "subs         %2, #12                      \n"
-    "bhi          1b                           \n"
+    "bgt          1b                           \n"
     : "+r"(src_ptr),          // %0
       "+r"(dst_ptr),          // %1
       "+r"(dst_width)         // %2
@@ -397,7 +397,7 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
     "vst1.u8      {d3}, [%1]!                  \n"
     "vst1.u32     {d4[0]}, [%1]!               \n"
     "subs         %2, #12                      \n"
-    "bhi          1b                           \n"
+    "bgt          1b                           \n"
     : "+r"(src_ptr),          // %0
       "+r"(dst_ptr),          // %1
       "+r"(dst_width),        // %2
@@ -492,7 +492,7 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
     "vst1.u8      {d3}, [%1]!                  \n"
     "vst1.u32     {d4[0]}, [%1]!               \n"
     "subs         %2, #12                      \n"
-    "bhi          1b                           \n"
+    "bgt          1b                           \n"
     : "+r"(src_ptr),          // %0
       "+r"(dst_ptr),          // %1
       "+r"(dst_width),        // %2
@@ -529,14 +529,14 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr,
     "vrshrn.u16   d0, q13, #8                  \n"
     "vrshrn.u16   d1, q14, #8                  \n"
     "vst1.u8      {q0}, [%0]!                  \n"
-    "bhi          1b                           \n"
+    "bgt          1b                           \n"
     "b            4f                           \n"
 
     "2:                                        \n"
     "vld1.u8      {q0}, [%1]!                  \n"
     "subs         %3, #16                      \n"
     "vst1.u8      {q0}, [%0]!                  \n"
-    "bhi          2b                           \n"
+    "bgt          2b                           \n"
     "b            4f                           \n"
 
     "3:                                        \n"
@@ -545,7 +545,7 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr,
     "subs         %3, #16                      \n"
     "vrhadd.u8    q0, q1                       \n"
     "vst1.u8      {q0}, [%0]!                  \n"
-    "bhi          3b                           \n"
+    "bgt          3b                           \n"
     "4:                                        \n"
     "vst1.u8      {d1[7]}, [%0]                \n"
     : "+r"(dst_ptr),          // %0
@@ -697,7 +697,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
-    ja         wloop
+    jg         wloop
 
     ret
   }
@@ -739,7 +739,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
     sub        ecx, 16
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
-    ja         wloop
+    jg         wloop
 
     pop        esi
     ret
@@ -772,7 +772,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
     sub        ecx, 8
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
-    ja         wloop
+    jg         wloop
 
     ret
   }
@@ -831,7 +831,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
     sub        ecx, 8
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
-    ja         wloop
+    jg         wloop
 
     pop        edi
     pop        esi
@@ -866,7 +866,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
     sub        ecx, 4
     movd       dword ptr [edx], xmm0
     lea        edx, [edx + 4]
-    ja         wloop
+    jg         wloop
 
     ret
   }
@@ -936,7 +936,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
     sub        ecx, 4
     movd       dword ptr [edx], xmm0
     lea        edx, [edx + 4]
-    ja         wloop
+    jg         wloop
 
     pop        ebp
     pop        edi
@@ -979,7 +979,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
     movq       qword ptr [edx + 16], xmm2
     lea        edx, [edx + 24]
     sub        ecx, 24
-    ja         wloop
+    jg         wloop
 
     ret
   }
@@ -1050,7 +1050,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
     sub        ecx, 24
     movq       qword ptr [edx + 16], xmm0
     lea        edx, [edx + 24]
-    ja         wloop
+    jg         wloop
 
     pop        esi
     ret
@@ -1111,7 +1111,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     sub        ecx, 24
     movq       qword ptr [edx + 16], xmm0
     lea        edx, [edx+24]
-    ja         wloop
+    jg         wloop
 
     pop        esi
     ret
@@ -1147,7 +1147,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
     movhlps    xmm1, xmm0
     movd       [edx + 8], xmm1
     lea        edx, [edx + 12]
-    ja         xloop
+    jg         xloop
 
     ret
   }
@@ -1212,7 +1212,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
     pextrw     ebx, xmm2, 2
     mov        [edx + 4], bx
     lea        edx, [edx + 6]
-    ja         xloop
+    jg         xloop
 
     pop        ebx
     pop        esi
@@ -1258,7 +1258,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
     pextrw     ebx, xmm0, 2
     mov        [edx + 4], bx
     lea        edx, [edx + 6]
-    ja         xloop
+    jg         xloop
 
     pop        ebx
     pop        esi
@@ -1310,14 +1310,14 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     paddusw    xmm0, xmm2        // sum 16 words
     paddusw    xmm1, xmm3
     sub        ebp, 1
-    ja         yloop
+    jg         yloop
   ydone:
     movdqa     [edi], xmm0
     movdqa     [edi + 16], xmm1
     lea        edi, [edi + 32]
 
     sub        ecx, 16
-    ja         xloop
+    jg         xloop
 
     pop        ebp
     pop        ebx
@@ -1379,7 +1379,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     sub        ecx, 16
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    ja         xloop
+    jg         xloop
 
     mov        al, [esi + edi - 1]
     mov        [esi + edi], al
@@ -1393,7 +1393,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     sub        ecx, 16
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    ja         xloop1
+    jg         xloop1
 
     mov        al, [esi + edi - 1]
     mov        [esi + edi], al
@@ -1408,7 +1408,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     sub        ecx, 16
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    ja         xloop2
+    jg         xloop2
 
     mov        al, [esi + edi - 1]
     mov        [esi + edi], al
@@ -1460,7 +1460,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     sub        ecx, 16
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    ja         xloop
+    jg         xloop
 
     mov        al, [esi + edi - 1]
     mov        [esi + edi], al
@@ -1474,7 +1474,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     sub        ecx, 16
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    ja         xloop1
+    jg         xloop1
 
     mov        al, [esi + edi - 1]
     mov        [esi + edi], al
@@ -1489,7 +1489,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     sub        ecx, 16
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    ja         xloop2
+    jg         xloop2
 
     mov        al, [esi + edi - 1]
     mov        [esi + edi], al
@@ -1542,7 +1542,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     sub        ecx, 24
     movq       qword ptr [edx+16], xmm0
     lea        edx, [edx+24]
-    ja         wloop
+    jg         wloop
     ret
   }
 }
@@ -1568,7 +1568,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
   "movdqa     %%xmm0,(%1)                      \n"
   "lea        0x10(%1),%1                      \n"
   "sub        $0x10,%2                         \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
@@ -1602,7 +1602,7 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   "movdqa     %%xmm0,(%1)                      \n"
   "lea        0x10(%1),%1                      \n"
   "sub        $0x10,%2                         \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
@@ -1628,7 +1628,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
   "movq       %%xmm0,(%1)                      \n"
   "lea        0x8(%1),%1                       \n"
   "sub        $0x8,%2                          \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
@@ -1677,7 +1677,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
   "movq       %%xmm0,(%1)                      \n"
   "lea        0x8(%1),%1                       \n"
   "sub        $0x8,%2                          \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(dst_width),   // %2
@@ -1708,7 +1708,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
     "movd      %%xmm0,(%1)                     \n"
     "lea       0x4(%1),%1                      \n"
     "sub       $0x4,%2                         \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
@@ -1744,14 +1744,14 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     "paddusw   %%xmm2,%%xmm0                   \n"
     "paddusw   %%xmm3,%%xmm1                   \n"
     "sub       $0x1,%2                         \n"
-    "ja        2b                              \n"
+    "jg        2b                              \n"
   "3:                                          \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "movdqa    %%xmm1,0x10(%1)                 \n"
     "lea       0x10(%3),%0                     \n"
     "lea       0x20(%1),%1                     \n"
     "sub       $0x10,%4                        \n"
-    "ja        1b                              \n"
+    "jg        1b                              \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(tmp_height),  // %2
@@ -1823,7 +1823,7 @@ extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
     "sub    $0x4,%ecx                          \n"
     "movd   %xmm0,(%edi)                       \n"
     "lea    0x4(%edi),%edi                     \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
     "popa                                      \n"
     "ret                                       \n"
 );
@@ -1857,7 +1857,7 @@ extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
     "movq   %xmm2,0x10(%edi)                   \n"
     "lea    0x18(%edi),%edi                    \n"
     "sub    $0x18,%ecx                         \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
     "popa                                      \n"
     "ret                                       \n"
 );
@@ -1910,7 +1910,7 @@ extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
     "sub    $0x18,%ecx                         \n"
     "movq   %xmm0,0x10(%edi)                   \n"
     "lea    0x18(%edi),%edi                    \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
 
     "popa                                      \n"
     "ret                                       \n"
@@ -1967,7 +1967,7 @@ extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     "sub    $0x18,%ecx                         \n"
     "movq   %xmm0,0x10(%edi)                   \n"
     "lea    0x18(%edi),%edi                    \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
     "popa                                      \n"
     "ret                                       \n"
 );
@@ -1997,7 +1997,7 @@ extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
     "sub    $0xc,%ecx                          \n"
     "movd   %xmm1,0x8(%edi)                    \n"
     "lea    0xc(%edi),%edi                     \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
     "popa                                      \n"
     "ret                                       \n"
 );
@@ -2054,7 +2054,7 @@ extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
     "mov    %ax,0x4(%edi)                      \n"
     "lea    0x6(%edi),%edi                     \n"
     "sub    $0x6,%ecx                          \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
     "popa                                      \n"
     "ret                                       \n"
 );
@@ -2091,7 +2091,7 @@ extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
     "mov    %ax,0x4(%edi)                      \n"
     "lea    0x6(%edi),%edi                     \n"
     "sub    $0x6,%ecx                          \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
     "popa                                      \n"
     "ret                                       \n"
 );
@@ -2147,7 +2147,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
     "sub    $0x10,%ecx                         \n"
     "movdqa %xmm0,(%esi,%edi,1)                \n"
     "lea    0x10(%esi),%esi                    \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
 
     "mov    -0x1(%esi,%edi,1),%al              \n"
     "mov    %al,(%esi,%edi,1)                  \n"
@@ -2160,7 +2160,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
     "sub    $0x10,%ecx                         \n"
     "movdqa %xmm0,(%esi,%edi,1)                \n"
     "lea    0x10(%esi),%esi                    \n"
-    "ja     2b                                 \n"
+    "jg     2b                                 \n"
 
     "mov    -0x1(%esi,%edi,1),%al              \n"
     "mov    %al,(%esi,%edi,1)                  \n"
@@ -2174,7 +2174,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
     "sub    $0x10,%ecx                         \n"
     "movdqa %xmm0,(%esi,%edi,1)                \n"
     "lea    0x10(%esi),%esi                    \n"
-    "ja     3b                                 \n"
+    "jg     3b                                 \n"
 
     "mov    -0x1(%esi,%edi,1),%al              \n"
     "mov    %al,(%esi,%edi,1)                  \n"
@@ -2224,7 +2224,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "sub    $0x10,%ecx                         \n"
     "movdqa %xmm0,(%esi,%edi,1)                \n"
     "lea    0x10(%esi),%esi                    \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
 
     "mov    -0x1(%esi,%edi,1),%al              \n"
     "mov    %al,(%esi,%edi,1)                  \n"
@@ -2237,7 +2237,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "sub    $0x10,%ecx                         \n"
     "movdqa %xmm0,(%esi,%edi,1)                \n"
     "lea    0x10(%esi),%esi                    \n"
-    "ja     2b                                 \n"
+    "jg     2b                                 \n"
 
     "mov    -0x1(%esi,%edi,1),%al              \n"
     "mov    %al,(%esi,%edi,1)                  \n"
@@ -2251,7 +2251,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "sub    $0x10,%ecx                         \n"
     "movdqa %xmm0,(%esi,%edi,1)                \n"
     "lea    0x10(%esi),%esi                    \n"
-    "ja     3b                                 \n"
+    "jg     3b                                 \n"
 
     "mov    -0x1(%esi,%edi,1),%al              \n"
     "mov    %al,(%esi,%edi,1)                  \n"
@@ -2310,7 +2310,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
   "movd       %%xmm0,(%1)                      \n"
   "lea        0x4(%1),%1                       \n"
   "sub        $0x4,%2                          \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(dst_width)    // %2
@@ -2340,7 +2340,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
   "movq       %%xmm2,0x10(%1)                  \n"
   "lea        0x18(%1),%1                      \n"
   "sub        $0x18,%2                         \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(dst_width)    // %2
@@ -2392,7 +2392,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
   "movq       %%xmm0,0x10(%1)                  \n"
   "lea        0x18(%1),%1                      \n"
   "sub        $0x18,%2                         \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(dst_width)    // %2
@@ -2452,7 +2452,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
   "movq       %%xmm0,0x10(%1)                  \n"
   "lea        0x18(%1),%1                      \n"
   "sub        $0x18,%2                         \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(dst_width)    // %2
@@ -2486,7 +2486,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
   "movd       %%xmm1,0x8(%1)                   \n"
   "lea        0xc(%1),%1                       \n"
   "sub        $0xc,%2                          \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(dst_width)    // %2
@@ -2541,7 +2541,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
   "mov        %%ax,0x4(%1)                     \n"
   "lea        0x6(%1),%1                       \n"
   "sub        $0x6,%2                          \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(dst_width)    // %2
@@ -2578,7 +2578,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
   "mov        %%ax,0x4(%1)                     \n"
   "lea        0x6(%1),%1                       \n"
   "sub        $0x6,%2                          \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(dst_width)    // %2
@@ -2604,7 +2604,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
       "movdqa     %%xmm0,(%0)                  \n"
       "lea        0x10(%0),%0                  \n"
       "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
+      "jg         1b                           \n"
       "mov        -0x1(%0),%%al                \n"
       "mov        %%al,(%0)                    \n"
       : "+r"(dst_ptr),     // %0
@@ -2624,7 +2624,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
       "movdqa     %%xmm0,(%0)                  \n"
       "lea        0x10(%0),%0                  \n"
       "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
+      "jg         1b                           \n"
       "mov        -0x1(%0),%%al                \n"
       "mov        %%al,(%0)                    \n"
       : "+r"(dst_ptr),     // %0
@@ -2668,7 +2668,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
       "movdqa     %%xmm0,(%0)                  \n"
       "lea        0x10(%0),%0                  \n"
       "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
+      "jg         1b                           \n"
       "mov        -0x1(%0),%%al                \n"
       "mov        %%al,(%0)                    \n"
       : "+r"(dst_ptr),     // %0
@@ -2695,7 +2695,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
       "movdqa     %%xmm0,(%0)                  \n"
       "lea        0x10(%0),%0                  \n"
       "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
+      "jg         1b                           \n"
       "mov        -0x1(%0),%%al                \n"
       "mov        %%al,(%0)                    \n"
       : "+r"(dst_ptr),     // %0
@@ -2715,7 +2715,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
       "movdqa     %%xmm0,(%0)                  \n"
       "lea        0x10(%0),%0                  \n"
       "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
+      "jg         1b                           \n"
       "mov        -0x1(%0),%%al                \n"
       "mov        %%al,(%0)                    \n"
       : "+r"(dst_ptr),     // %0
@@ -2750,7 +2750,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
       "movdqa     %%xmm0,(%0)                  \n"
       "lea        0x10(%0),%0                  \n"
       "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
+      "jg         1b                           \n"
       "mov        -0x1(%0),%%al                \n"
       "mov        %%al,(%0)                    \n"
       : "+r"(dst_ptr),     // %0