Add .p2align 4 to all loops, copy stride to a local for scale, and copy the last byte in bilinear filtering more efficiently

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/547007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@255 16f28f9a-4ce2-e073-06de-1de4eb20be90
fbarchard@google.com 2012-05-02 00:10:16 +00:00
parent f906ae1360
commit 5bf29b59db
10 changed files with 199 additions and 78 deletions
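
The loop-alignment change is mechanical: every hot loop in the inline assembly now emits ".p2align 4" immediately before its "1:" label (or "align 16" in the MSVC blocks) so the loop entry starts on a 16-byte boundary. A minimal standalone sketch of the pattern, assuming GCC/Clang x86/x86-64 inline assembly with SSE2 available, a 16-byte-aligned dst, a count that is a positive multiple of 16, and a hypothetical ZeroFill16_SSE2 helper that is not part of libyuv:

#include <stdint.h>

// Hypothetical helper, for illustration only: fill 'count' bytes with zero,
// 16 bytes per iteration, with the loop head aligned to a 16-byte boundary.
static void ZeroFill16_SSE2(uint8_t* dst, int count) {
  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"
    ".p2align  4                               \n"  // pad with nops so label 1 starts 16-byte aligned
  "1:                                          \n"
    "movdqa    %%xmm0,(%0)                     \n"  // aligned 16-byte store
    "lea       0x10(%0),%0                     \n"
    "sub       $0x10,%1                        \n"
    "jg        1b                              \n"
  : "+r"(dst),    // %0
    "+r"(count)   // %1
  :
  : "memory", "cc", "xmm0"
  );
}

The directive only inserts padding ahead of the label; the loop body is untouched, which is why most of the hunks below are single added lines.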

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 254
Version: 255
License: BSD
License File: LICENSE

View File

@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 254
#define LIBYUV_VERSION 255
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -163,6 +163,7 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
@ -331,7 +332,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm1 \n"
"movdqa (%0,%1,1),%%xmm2 \n"

View File

@ -74,6 +74,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
mov ecx, [esp + 4 + 16] // pix
sub edi, eax
align 16
convertloop:
movdqa xmm0, [eax]
pavgb xmm0, [eax + edx]
@ -92,6 +93,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
asm volatile (
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pavgb (%0,%3),%%xmm0 \n"
@ -467,6 +469,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@ -506,6 +509,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"

View File

@ -291,6 +291,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n"
@ -326,6 +327,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n"

View File

@ -57,6 +57,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"movd %3,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"

View File

@ -288,6 +288,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
@ -499,6 +500,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
@ -639,6 +641,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n"

View File

@ -32,6 +32,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"
@ -198,6 +199,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"

View File

@ -112,6 +112,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
@ -141,6 +142,7 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile (
"movdqa %3,%%xmm5 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
@ -164,6 +166,7 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
asm volatile (
"movdqa %3,%%xmm5 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
@ -187,6 +190,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -227,6 +231,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -279,6 +284,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -327,6 +333,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -372,6 +379,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0x4,%%xmm5 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
@ -405,6 +413,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -445,6 +454,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -491,6 +501,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0x5,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0xb,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -531,6 +542,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xa,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n"
"pslld $0xf,%%xmm7 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -570,6 +582,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"psllw $0xc,%%xmm4 \n"
"movdqa %%xmm4,%%xmm3 \n"
"psrlw $0x8,%%xmm3 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -599,6 +612,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -635,6 +649,7 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -689,6 +704,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -753,6 +769,7 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -808,6 +825,7 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -844,6 +862,7 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -893,6 +912,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -957,6 +977,7 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1012,6 +1033,7 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1048,6 +1070,7 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1097,6 +1120,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1161,6 +1185,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1295,6 +1320,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
@ -1329,6 +1355,7 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
@ -1364,6 +1391,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
@ -1398,6 +1426,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
@ -1432,6 +1461,7 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
@ -1467,6 +1497,7 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
@ -1501,6 +1532,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movd (%1),%%xmm0 \n"
"movd (%1,%2,1),%%xmm1 \n"
@ -1562,6 +1594,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"mov $0x012a012a,%%eax \n"
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
".p2align 4 \n"
"1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq (%0),%%xmm0 \n"
@ -1607,6 +1640,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
asm volatile (
"movdqa %3,%%xmm5 \n"
"lea -0x10(%0),%0 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0,%2),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
@ -1631,6 +1665,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"lea -0x10(%0),%0 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0,%2),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -1668,6 +1703,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
@ -1695,6 +1731,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
@ -1725,6 +1762,7 @@ void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
@ -1758,6 +1796,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1833,6 +1872,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1861,6 +1901,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1900,6 +1941,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1930,6 +1972,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1965,6 +2008,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1993,6 +2037,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -2029,6 +2074,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
asm volatile (
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -2057,6 +2103,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -2109,6 +2156,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pslld $0x18,%%xmm4 \n"
// 8 pixel loop
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm3 \n"
"movdqa %%xmm3,%%xmm0 \n"
@ -2184,6 +2232,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pslld $0x18,%%xmm4 \n"
// 1 pixel loop
".p2align 4 \n"
"1: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
@ -2241,6 +2290,7 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pslld $0x18,%%xmm4 \n"
// 8 pixel loop
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm3 \n"
"movdqa %%xmm3,%%xmm0 \n"
@ -2313,6 +2363,7 @@ void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pslld $0x18,%%xmm4 \n"
// 1 pixel loop
".p2align 4 \n"
"1: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
@ -2361,6 +2412,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"psrld $0x8,%%xmm5 \n"
// 4 pixel loop
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@ -2415,6 +2467,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"movdqa %4,%%xmm5 \n"
// 4 pixel loop
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm4,%%xmm0 \n"
@ -2503,6 +2556,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pslld $0x18,%%xmm4 \n"
// 4 pixel loop
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movzb 0x3(%0),%3 \n"

View File

@ -1457,8 +1457,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0 // duplicate last pixel to allow horizontal filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1471,8 +1473,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop1
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0 // duplicate last pixel to allow horizontal filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1486,8 +1490,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop2
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0 // duplicate last pixel to allow horizontal filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1538,8 +1544,11 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0 // duplicate last pixel to allow horizontal filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1552,8 +1561,10 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop1
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1567,8 +1578,10 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop2
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
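
A rough C++ rendering of the new tail handling above, using SSE2 intrinsics rather than the MASM/inline assembly in the source: instead of copying the single last output byte one position past the end of the row (the old mov al / mov [esi + edi], al pair), the rewritten code broadcasts the last byte of the final 16 results, still sitting in xmm0, across the 16 bytes following the row, so later horizontal filtering can read a full vector past the end. BroadcastTail16 is a hypothetical name, and the aligned store assumes dst + width is 16-byte aligned, as it is on this SIMD path:

#include <emmintrin.h>  // SSE2 intrinsics
#include <stdint.h>

// Hypothetical helper mirroring the punpckhbw / pshufhw / punpckhqdq / movdqa
// sequence added above.  'last16' holds the final 16 filtered output bytes.
static void BroadcastTail16(uint8_t* dst, int width, __m128i last16) {
  __m128i x = _mm_unpackhi_epi8(last16, last16);  // b8 b8 b9 b9 ... b15 b15
  x = _mm_shufflehi_epi16(x, 0xff);               // upper 4 words all become (b15, b15)
  x = _mm_unpackhi_epi64(x, x);                   // 16 copies of the last byte b15
  _mm_store_si128(reinterpret_cast<__m128i*>(dst + width), x);  // one aligned store past the row
}

One 16-byte store replaces a byte-sized load and store through al and reuses data already in a register, which is the "more efficiently" of the commit message.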
@ -1634,6 +1647,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1658,6 +1672,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1692,6 +1707,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1717,6 +1733,7 @@ static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1752,6 +1769,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1779,6 +1797,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0x8,%%xmm7 \n"
"lea (%4,%4,2),%3 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1831,6 +1850,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlq $0x38,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1860,6 +1880,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"mov %0,%3 \n"
@ -2284,8 +2305,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 1b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2297,8 +2320,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 2b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2311,8 +2336,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 3b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2361,8 +2388,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 1b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2374,8 +2403,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 2b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2388,8 +2419,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 3b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2401,6 +2434,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"lea (%3,%3,2),%%r10 \n"
"pxor %%xmm7,%%xmm7 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -2461,6 +2495,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%3),%%xmm3 \n"
"movdqa (%4),%%xmm4 \n"
"movdqa (%5),%%xmm5 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm2 \n"
@ -2496,6 +2531,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%8),%%xmm6 \n" // _madd11
"movdqa (%9),%%xmm7 \n" // _round34
"movdqa (%10),%%xmm8 \n" // _madd21
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
@ -2553,6 +2589,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%8),%%xmm6 \n" // _madd11
"movdqa (%9),%%xmm7 \n" // _round34
"movdqa (%10),%%xmm8 \n" // _madd21
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3,1),%%xmm1 \n"
@ -2609,6 +2646,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
asm volatile (
"movdqa (%3),%%xmm4 \n"
"movdqa (%4),%%xmm5 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -2638,6 +2676,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%5),%%xmm5 \n"
"movdqa (%6),%%xmm6 \n"
"pxor %%xmm7,%%xmm7 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3,1),%%xmm2 \n"
@ -2695,6 +2734,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%5),%%xmm5 \n"
"movdqa (%6),%%xmm6 \n"
"movdqa (%7),%%xmm7 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm2 \n"
"pavgb (%0,%3,1),%%xmm2 \n"
@ -2733,6 +2773,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
int dst_width, int source_y_fraction) {
if (source_y_fraction == 0) {
asm volatile (
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -2751,6 +2792,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
return;
} else if (source_y_fraction == 128) {
asm volatile (
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%3,1),%%xmm2 \n"
@ -2781,6 +2823,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"pxor %%xmm7,%%xmm7 \n"
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm2 \n"
@ -2824,6 +2867,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
int dst_width, int source_y_fraction) {
if (source_y_fraction <= 1) {
asm volatile (
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -2842,6 +2886,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
return;
} else if (source_y_fraction == 128) {
asm volatile (
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%3,1),%%xmm2 \n"
@ -2870,6 +2915,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"movd %%eax,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm2 \n"
@ -2904,12 +2950,13 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
// CPU agnostic row functions
static void ScaleRowDown2_C(const uint8* src_ptr, int,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[2];
dst += 2;
src_ptr += 4;
}
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
}
@ -2917,28 +2964,30 @@ static void ScaleRowDown2_C(const uint8* src_ptr, int,
void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] +
src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
dst[1] = (src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + 2) >> 2;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
uint8* dend = dst + dst_width - 1;
do {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
dst += 2;
src_ptr += 4;
}
s += 4;
t += 4;
} while (dst < dend);
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] +
src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
}
}
static void ScaleRowDown4_C(const uint8* src_ptr, int,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[4];
dst += 2;
src_ptr += 8;
}
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
}
@ -2946,34 +2995,36 @@ static void ScaleRowDown4_C(const uint8* src_ptr, int,
static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
intptr_t stride = src_stride;
uint8* dend = dst + dst_width - 1;
do {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] +
src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] +
src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] +
src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] +
src_ptr[stride + 4] + src_ptr[stride + 5] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
8) >> 4;
dst += 2;
src_ptr += 8;
}
} while (dst < dend);
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
}
}
@ -2985,12 +3036,13 @@ static const int kMaxRow12 = kMaxOutputWidth * 2;
static void ScaleRowDown8_C(const uint8* src_ptr, int,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[8];
dst += 2;
src_ptr += 16;
}
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
}
@ -3026,9 +3078,9 @@ static void ScaleRowDown34_C(const uint8* src_ptr, int,
static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
uint8* dend = d + dst_width;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
uint8* dend = d + dst_width;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@ -3049,9 +3101,9 @@ static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
uint8* dend = d + dst_width;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
uint8* dend = d + dst_width;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@ -3073,8 +3125,8 @@ static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
uint8* dend = dst_ptr + dst_width;
const uint8* s = src_ptr;
uint8* dend = dst_ptr + dst_width;
do {
dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@ -3150,20 +3202,21 @@ static void ScaleRowDown38_C(const uint8* src_ptr, int,
static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
intptr_t stride = src_stride;
for (int i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
src_ptr += 8;
dst_ptr += 3;
@ -3174,15 +3227,16 @@ static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
intptr_t stride = src_stride;
for (int i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
src_ptr += 8;
dst_ptr += 3;
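
For the plain-C scale rows above, the rewrite follows one pattern: hoist the row addressing out of the loop and walk pointers against a precomputed end pointer instead of keeping an index. ScaleRowDown2Int_C now advances two source pointers, while the 4x and 3/8 variants copy src_stride into a local intptr_t, presumably so the compiler keeps a pointer-sized stride in a register rather than re-reading the int argument in every subscript. A self-contained sketch of the same pattern, with a hypothetical name and uint8_t standing in for libyuv's uint8; it assumes dst_width >= 2, since the do/while executes at least once:

#include <stdint.h>

// Hypothetical 2x2 box-filter row mirroring the rewritten ScaleRowDown2Int_C.
static void HalveRowBox_C(const uint8_t* src_ptr, int src_stride,
                          uint8_t* dst, int dst_width) {
  const uint8_t* s = src_ptr;               // current row
  const uint8_t* t = src_ptr + src_stride;  // row below
  uint8_t* dend = dst + dst_width - 1;      // stop before the odd tail pixel
  do {
    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // rounded 2x2 average
    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
    dst += 2;
    s += 4;
    t += 4;
  } while (dst < dend);
  if (dst_width & 1) {                      // odd width: one last pixel
    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
  }
}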