diff --git a/README.chromium b/README.chromium
index 97118c926..2793fa796 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 485
+Version: 486
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index c03128ba0..2a1468d8d 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 485
+#define LIBYUV_VERSION 486

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/scale.cc b/source/scale.cc
index e6484afbd..b6eaa3527 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1035,24 +1035,26 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
     shr        eax, 1
     cmp        eax, 0  // dispatch to specialized filters if applicable.
     je         xloop100
+    sub        edi, esi
     cmp        eax, 32
     je         xloop75
     cmp        eax, 64
     je         xloop50
     cmp        eax, 96
     je         xloop25
-    movd       xmm0, eax  // high fraction 0..127
+
+    movd       xmm0, eax  // high fraction 1..127.
     neg        eax
     add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
+    movd       xmm5, eax  // low fraction 127..1.
     punpcklbw  xmm5, xmm0
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
+
+    // General purpose row blend.
     align      16
   xloop:
     movdqa     xmm0, [esi]
@@ -1069,71 +1071,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop
-
-    punpckhbw  xmm0, xmm0  // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 100 / 0 - Copy row unchanged.
-    align      16
-  xloop100:
-    movdqa     xmm0, [esi]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 75 / 25.
-    align      16
-  xloop75:
-    movdqa     xmm1, [esi]
-    movdqa     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop75
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 50 / 50.
-    align      16
-  xloop50:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop50
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
+    jmp        xloop99

     // Blend 25 / 75.
     align      16
@@ -1146,7 +1084,44 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      16
+  xloop50:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
+    movdqa     xmm0, [esi]
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
+    // Extrude last pixel.
+  xloop99:
     punpckhbw  xmm0, xmm0
     pshufhw    xmm0, xmm0, 0xff
     punpckhqdq xmm0, xmm0
@@ -1154,7 +1129,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     pop        edi
     pop        esi
     ret
-
   }
 }
@@ -1171,29 +1145,31 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
     shr        eax, 1
-    cmp        eax, 0
+    cmp        eax, 0  // dispatch to specialized filters if applicable.
     je         xloop100
+    sub        edi, esi
     cmp        eax, 32
     je         xloop75
     cmp        eax, 64
     je         xloop50
     cmp        eax, 96
     je         xloop25
-    movd       xmm0, eax  // high fraction 0..127
+
+    movd       xmm0, eax  // high fraction 1..127.
     neg        eax
     add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
+    movd       xmm5, eax  // low fraction 127..1.
     punpcklbw  xmm5, xmm0
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
+
+    // General purpose row blend.
     align      16
   xloop:
     movdqu     xmm0, [esi]
     movdqu     xmm2, [esi + edx]
-    movdqu     xmm1, xmm0
+    movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
     pmaddubsw  xmm0, xmm5
@@ -1205,71 +1181,7 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop
-
-    punpckhbw  xmm0, xmm0  // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 100 / 0 - Copy row unchanged.
-    align      16
-  xloop100:
-    movdqu     xmm0, [esi]
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 75 / 25.
-    align      16
-  xloop75:
-    movdqu     xmm1, [esi]
-    movdqu     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop75
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 50 / 50.
-    align      16
-  xloop50:
-    movdqu     xmm0, [esi]
-    movdqu     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop50
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
+    jmp        xloop99

     // Blend 25 / 75.
     align      16
@@ -1282,7 +1194,44 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      16
+  xloop50:
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqu     xmm1, [esi]
+    movdqu     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
+    movdqu     xmm0, [esi]
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
+    // Extrude last pixel.
+  xloop99:
     punpckhbw  xmm0, xmm0
     pshufhw    xmm0, xmm0, 0xff
     punpckhqdq xmm0, xmm0
@@ -1290,7 +1239,6 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     pop        edi
     pop        esi
     ret
-
   }
 }
@@ -2068,9 +2016,13 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "sub %1,%0 \n"
     "shr %3 \n"
     "cmp $0x0,%3 \n"
-    "je 2f \n"
+    "je 100f \n"
+    "cmp $0x20,%3 \n"
+    "je 75f \n"
     "cmp $0x40,%3 \n"
-    "je 3f \n"
+    "je 50f \n"
+    "cmp $0x60,%3 \n"
+    "je 25f \n"
     "movd %3,%%xmm0 \n"
     "neg %3 \n"
     "add $0x80,%3 \n"
@@ -2078,6 +2030,8 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "punpcklbw %%xmm0,%%xmm5 \n"
     "punpcklwd %%xmm5,%%xmm5 \n"
     "pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+    // General purpose row blend.
     ".p2align 4 \n"
     "1: \n"
     "movdqa (%1),%%xmm0 \n"
@@ -2094,25 +2048,57 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "movdqa %%xmm0,(%1,%0,1) \n"
     "lea 0x10(%1),%1 \n"
     "jg 1b \n"
-    "jmp 4f \n"
+    "jmp 99f \n"
+
+    // Blend 25 / 75.
     ".p2align 4 \n"
-    "2: \n"
+    "25: \n"
+    "movdqa (%1),%%xmm0 \n"
+    "movdqa (%1,%4,1),%%xmm1 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "sub $0x10,%2 \n"
+    "movdqa %%xmm0,(%1,%0,1) \n"
+    "lea 0x10(%1),%1 \n"
+    "jg 25b \n"
+    "jmp 99f \n"
+
+    // Blend 50 / 50.
+    ".p2align 4 \n"
+    "50: \n"
+    "movdqa (%1),%%xmm0 \n"
+    "movdqa (%1,%4,1),%%xmm1 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "sub $0x10,%2 \n"
+    "movdqa %%xmm0,(%1,%0,1) \n"
+    "lea 0x10(%1),%1 \n"
+    "jg 50b \n"
+    "jmp 99f \n"
+
+    // Blend 75 / 25.
+    ".p2align 4 \n"
+    "75: \n"
+    "movdqa (%1),%%xmm1 \n"
+    "movdqa (%1,%4,1),%%xmm0 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "sub $0x10,%2 \n"
+    "movdqa %%xmm0,(%1,%0,1) \n"
+    "lea 0x10(%1),%1 \n"
+    "jg 75b \n"
+    "jmp 99f \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align 4 \n"
+    "100: \n"
     "movdqa (%1),%%xmm0 \n"
     "sub $0x10,%2 \n"
     "movdqa %%xmm0,(%1,%0,1) \n"
     "lea 0x10(%1),%1 \n"
-    "jg 2b \n"
-    "jmp 4f \n"
-    ".p2align 4 \n"
-    "3: \n"
-    "movdqa (%1),%%xmm0 \n"
-    "pavgb (%1,%4,1),%%xmm0 \n"
-    "sub $0x10,%2 \n"
-    "movdqa %%xmm0,(%1,%0,1) \n"
-    "lea 0x10(%1),%1 \n"
-    "jg 3b \n"
-    ".p2align 4 \n"
-    "4: \n"
+    "jg 100b \n"
+
+    // Extrude last pixel.
+    "99: \n"
     "punpckhbw %%xmm0,%%xmm0 \n"
     "pshufhw $0xff,%%xmm0,%%xmm0 \n"
     "punpckhqdq %%xmm0,%%xmm0 \n"
@@ -2137,9 +2123,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     "sub %1,%0 \n"
     "shr %3 \n"
     "cmp $0x0,%3 \n"
-    "je 2f \n"
+    "je 100f \n"
+    "cmp $0x20,%3 \n"
+    "je 75f \n"
     "cmp $0x40,%3 \n"
-    "je 3f \n"
+    "je 50f \n"
+    "cmp $0x60,%3 \n"
+    "je 25f \n"
     "movd %3,%%xmm0 \n"
     "neg %3 \n"
     "add $0x80,%3 \n"
@@ -2147,11 +2137,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     "punpcklbw %%xmm0,%%xmm5 \n"
     "punpcklwd %%xmm5,%%xmm5 \n"
     "pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+    // General purpose row blend.
     ".p2align 4 \n"
     "1: \n"
     "movdqu (%1),%%xmm0 \n"
     "movdqu (%1,%4,1),%%xmm2 \n"
-    "movdqu %%xmm0,%%xmm1 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
     "punpcklbw %%xmm2,%%xmm0 \n"
     "punpckhbw %%xmm2,%%xmm1 \n"
     "pmaddubsw %%xmm5,%%xmm0 \n"
@@ -2163,25 +2155,57 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
     "movdqu %%xmm0,(%1,%0,1) \n"
     "lea 0x10(%1),%1 \n"
     "jg 1b \n"
-    "jmp 4f \n"
+    "jmp 99f \n"
+
+    // Blend 25 / 75.
     ".p2align 4 \n"
-    "2: \n"
+    "25: \n"
+    "movdqu (%1),%%xmm0 \n"
+    "movdqu (%1,%4,1),%%xmm1 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "sub $0x10,%2 \n"
+    "movdqu %%xmm0,(%1,%0,1) \n"
+    "lea 0x10(%1),%1 \n"
+    "jg 25b \n"
+    "jmp 99f \n"
+
+    // Blend 50 / 50.
+ ".p2align 4 \n" + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu (%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + ".p2align 4 \n" + "75: \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%1,%4,1),%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + ".p2align 4 \n" + "100: \n" "movdqu (%1),%%xmm0 \n" "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" - "jg 2b \n" - "jmp 4f \n" - ".p2align 4 \n" - "3: \n" - "movdqu (%1),%%xmm0 \n" - "pavgb (%1,%4,1),%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 3b \n" - ".p2align 4 \n" - "4: \n" + "jg 100b \n" + + // Extrude last pixel. + "99: \n" "punpckhbw %%xmm0,%%xmm0 \n" "pshufhw $0xff,%%xmm0,%%xmm0 \n" "punpckhqdq %%xmm0,%%xmm0 \n" diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 5d4e1ac01..64841937a 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -289,12 +289,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi shr eax, 1 - cmp eax, 0 - je xloop1 + cmp eax, 0 // dispatch to specialized filters if applicable. + je xloop100 + sub edi, esi + cmp eax, 32 + je xloop75 cmp eax, 64 - je xloop2 + je xloop50 + cmp eax, 96 + je xloop25 + movd xmm0, eax // high fraction 0..127 neg eax add eax, 128 @@ -319,36 +324,57 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop + jmp xloop99 - shufps xmm0, xmm0, 0xff - movdqa [esi + edi], xmm0 // duplicate last pixel for filtering - pop edi - pop esi - ret - + // Blend 25 / 75. align 16 - xloop1: + xloop25: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 16 + xloop50: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 16 + xloop75: + movdqa xmm1, [esi] + movdqa xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 16 + xloop100: movdqa xmm0, [esi] sub ecx, 4 movdqa [esi + edi], xmm0 lea esi, [esi + 16] - jg xloop1 - - shufps xmm0, xmm0, 0xff - movdqa [esi + edi], xmm0 - pop edi - pop esi - ret - - align 16 - xloop2: - movdqa xmm0, [esi] - pavgb xmm0, [esi + edx] - sub ecx, 4 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop2 + jg xloop100 + // Extrude last pixel. 
+  xloop99:
     shufps     xmm0, xmm0, 0xff
     movdqa     [esi + edi], xmm0
     pop        edi
@@ -585,12 +611,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                ptrdiff_t src_stride, int dst_width,
                                int source_y_fraction) {
   asm volatile (
-    "sub %1,%0 \n"
     "shr %3 \n"
     "cmp $0x0,%3 \n"
-    "je 2f \n"
+    "je 100f \n"
+    "sub %1,%0 \n"
+    "cmp $0x20,%3 \n"
+    "je 75f \n"
     "cmp $0x40,%3 \n"
-    "je 3f \n"
+    "je 50f \n"
+    "cmp $0x60,%3 \n"
+    "je 25f \n"
+
     "movd %3,%%xmm0 \n"
     "neg %3 \n"
     "add $0x80,%3 \n"
@@ -598,6 +629,8 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     "punpcklbw %%xmm0,%%xmm5 \n"
     "punpcklwd %%xmm5,%%xmm5 \n"
     "pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+    // General purpose row blend.
     ".p2align 4 \n"
     "1: \n"
     "movdqa (%1),%%xmm0 \n"
@@ -614,30 +647,62 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     "movdqa %%xmm0,(%1,%0,1) \n"
     "lea 0x10(%1),%1 \n"
     "jg 1b \n"
-    "jmp 4f \n"
+    "jmp 99f \n"
+
+    // Blend 25 / 75.
     ".p2align 4 \n"
-    "2: \n"
+    "25: \n"
+    "movdqa (%1),%%xmm0 \n"
+    "movdqa (%1,%4,1),%%xmm1 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "sub $0x4,%2 \n"
+    "movdqa %%xmm0,(%1,%0,1) \n"
+    "lea 0x10(%1),%1 \n"
+    "jg 25b \n"
+    "jmp 99f \n"
+
+    // Blend 50 / 50.
+    ".p2align 4 \n"
+    "50: \n"
+    "movdqa (%1),%%xmm0 \n"
+    "movdqa (%1,%4,1),%%xmm1 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "sub $0x4,%2 \n"
+    "movdqa %%xmm0,(%1,%0,1) \n"
+    "lea 0x10(%1),%1 \n"
+    "jg 50b \n"
+    "jmp 99f \n"
+
+    // Blend 75 / 25.
+    ".p2align 4 \n"
+    "75: \n"
+    "movdqa (%1),%%xmm1 \n"
+    "movdqa (%1,%4,1),%%xmm0 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "pavgb %%xmm1,%%xmm0 \n"
+    "sub $0x4,%2 \n"
+    "movdqa %%xmm0,(%1,%0,1) \n"
+    "lea 0x10(%1),%1 \n"
+    "jg 75b \n"
+    "jmp 99f \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align 4 \n"
+    "100: \n"
     "movdqa (%1),%%xmm0 \n"
     "sub $0x4,%2 \n"
     "movdqa %%xmm0,(%1,%0,1) \n"
     "lea 0x10(%1),%1 \n"
-    "jg 2b \n"
-    "jmp 4f \n"
-    ".p2align 4 \n"
-    "3: \n"
-    "movdqa (%1),%%xmm0 \n"
-    "pavgb (%1,%4,1),%%xmm0 \n"
-    "sub $0x4,%2 \n"
-    "movdqa %%xmm0,(%1,%0,1) \n"
-    "lea 0x10(%1),%1 \n"
-    "jg 3b \n"
-    "4: \n"
-    ".p2align 4 \n"
+    "jg 100b \n"
+
+    // Extrude last pixel.
+    "99: \n"
     "shufps $0xff,%%xmm0,%%xmm0 \n"
     "movdqa %%xmm0,(%1,%0,1) \n"
-  : "+r"(dst_ptr),    // %0
-    "+r"(src_ptr),    // %1
-    "+r"(dst_width),  // %2
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_width),   // %2
     "+r"(source_y_fraction)  // %3
   : "r"(static_cast<intptr_t>(src_stride))  // %4
   : "memory", "cc"
@@ -645,6 +710,7 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     , "xmm0", "xmm1", "xmm2", "xmm5"
 #endif
   );
+
 }
 #endif  // defined(__x86_64__) || defined(__i386__)
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 4af3c1554..f521c63d8 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -477,14 +477,19 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
                           int dst_width, int source_y_fraction) {
   asm volatile (
     "cmp %4, #0 \n"
-    "beq 2f \n"
+    "beq 100f \n"
     "add %2, %1 \n"
+    "cmp %4, #64 \n"
+    "beq 75f \n"
     "cmp %4, #128 \n"
-    "beq 3f \n"
+    "beq 50f \n"
+    "cmp %4, #192 \n"
+    "beq 25f \n"
     "vdup.8 d5, %4 \n"
     "rsb %4, #256 \n"
     "vdup.8 d4, %4 \n"
+    // General purpose row blend.
     "1: \n"
     "vld1.u8 {q0}, [%1]! \n"
     "vld1.u8 {q1}, [%2]! \n"
@@ -497,23 +502,48 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
     "vrshrn.u16 d1, q14, #8 \n"
     "vst1.u8 {q0}, [%0]! \n"
     "bgt 1b \n"
-    "b 4f \n"
+    "b 99f \n"

-    "2: \n"
+    // Blend 25 / 75.
+    "25: \n"
     "vld1.u8 {q0}, [%1]! \n"
+    "vld1.u8 {q1}, [%2]! \n"
     "subs %3, #16 \n"
+    "vrhadd.u8 q0, q1 \n"
+    "vrhadd.u8 q0, q1 \n"
     "vst1.u8 {q0}, [%0]! \n"
\n" - "bgt 2b \n" - "b 4f \n" + "bgt 25b \n" + "b 99f \n" - "3: \n" + // Blend 50 / 50. + "50: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" "subs %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" - "bgt 3b \n" - "4: \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "vld1.u8 {q1}, [%1]! \n" + "vld1.u8 {q0}, [%2]! \n" + "subs %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.u8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.u8 {q0}, [%1]! \n" + "subs %3, #16 \n" + "vst1.u8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" "vst1.u8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1