diff --git a/README.chromium b/README.chromium index 0d8b799cd..afb802c94 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 86 +Version: 89 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 85b423ffc..bc7c26dd1 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -46,6 +46,12 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, uint8* dst_frame, int dst_stride_frame, int width, int height); +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + int I420ToUYVY(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 5e2cb500e..884e647f1 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -50,6 +50,24 @@ int I422ToI420(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height); +// Convert I420 to I422. +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I444 to I420. +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + // Convert NV12 to I420. Also used for NV21.
int NV12ToI420(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, diff --git a/source/convert.cc b/source/convert.cc index fd21b96c7..fb5dd58f1 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1057,22 +1057,13 @@ int ConvertToI420(const uint8* sample, size_t sample_size, case FOURCC_RGGB: case FOURCC_GRBG: case FOURCC_GBRG: - // TODO(fbarchard): We could support cropping by odd numbers by - // adjusting fourcc. + // TODO(fbarchard): Support cropping by odd numbers by adjusting fourcc. src = sample + (src_width * crop_y + crop_x); BayerRGBToI420(src, src_width, format, y, y_stride, u, u_stride, v, v_stride, dst_width, inv_dst_height); break; // Biplanar formats - case FOURCC_M420: - src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - M420ToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - dst_width, inv_dst_height); - break; case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; @@ -1094,6 +1085,14 @@ int ConvertToI420(const uint8* sample, size_t sample_size, v, v_stride, dst_width, inv_dst_height, rotation); break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + M420ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; case FOURCC_Q420: src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + @@ -1133,6 +1132,33 @@ int ConvertToI420(const uint8* sample, size_t sample_size, dst_width, inv_dst_height, rotation); break; } + // Triplanar formats + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_I422) { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width 
* abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + I422ToI420(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + } // Formats not supported case FOURCC_MJPG: default: diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 8a3915b76..85ae29e63 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -126,12 +126,75 @@ static void SplitUV_C(const uint8* src_uv, } } -static void I420CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time +#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#define HAS_COPYROW_SSE2 +__declspec(naked) +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + ja convertloop + ret + } +} +#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) +#define HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + asm volatile ( +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "ja 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif +); +} +#endif + +void CopyRow_C(const uint8* src, uint8* dst, int count) { + memcpy(dst, src, count);
+} + +static void CopyPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + void (*CopyRow)(const uint8* src, uint8* dst, int width); +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && (width % 32 == 0) && + IS_ALIGNED(src_y, 16) && (src_stride_y % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { // for movdqa stores + CopyRow = CopyRow_SSE2; + } else +#endif + { + CopyRow = CopyRow_C; + } + // Copy plane for (int y = 0; y < height; ++y) { - memcpy(dst_y, src_y, width); + CopyRow(src_y, dst_y, width); src_y += src_stride_y; dst_y += dst_stride_y; } @@ -150,7 +213,6 @@ int I420Copy(const uint8* src_y, int src_stride_y, width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; @@ -165,9 +227,9 @@ int I420Copy(const uint8* src_y, int src_stride_y, int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; } @@ -178,52 +240,66 @@ int I420Mirror(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (src_y == NULL || src_u == NULL || src_v == NULL || - dst_y == NULL || dst_u == NULL || dst_v == NULL) + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { return -1; + } + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; - // Only accepts positive dimensions - if (height < 0 || width < 0 || src_stride_y < 0 || src_stride_u < 0 || - src_stride_v < 0 || dst_stride_y < 0 ||
dst_stride_u < 0 || - dst_stride_v < 0) - return -1; - - int indO = 0; - int indS = 0; - int wind, hind; - uint8 tmp_val; - // Will swap two values per iteration - const int half_width = (width + 1) >> 1; - - // Y - for (wind = 0; wind < half_width; ++wind) { - for (hind = 0; hind < height; ++hind) { - indO = hind * src_stride_y + wind; - indS = hind * dst_stride_y + (width - wind - 1); - tmp_val = src_y[indO]; - dst_y[indO] = src_y[indS]; - dst_y[indS] = tmp_val; - } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + void (*ReverseRow)(const uint8* src, uint8* dst, int width); +#if defined(HAS_REVERSE_ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (width % 32 == 0)) { + ReverseRow = ReverseRow_NEON; + } else +#endif +#if defined(HAS_REVERSE_ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + (width % 32 == 0) && + IS_ALIGNED(src_y, 16) && (src_stride_y % 16 == 0) && + IS_ALIGNED(src_u, 16) && (src_stride_u % 16 == 0) && + IS_ALIGNED(src_v, 16) && (src_stride_v % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && + IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && + IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) { + ReverseRow = ReverseRow_SSSE3; + } else +#endif + { + ReverseRow = ReverseRow_C; } - const int half_height = (height + 1) >> 1; - const int half_uv_width = (width + 1) >> 1; - - for (wind = 0; wind < half_uv_width; ++wind) { - for (hind = 0; hind < half_height; ++hind) { - // U - indO = hind * dst_stride_u + wind; - indS = hind * dst_stride_u + (half_uv_width - wind - 1); - tmp_val = src_u[indO]; - dst_u[indO] = src_u[indS]; - dst_u[indS] = tmp_val; - // V - indO = hind * dst_stride_v + wind; - indS = hind * dst_stride_v + 
(half_uv_width - wind - 1); - tmp_val = src_v[indO]; - dst_v[indO] = src_v[indS]; - dst_v[indS] = tmp_val; - } + // Y Plane + int y; + for (y = 0; y < height; ++y) { + ReverseRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } + // U Plane + for (y = 0; y < halfheight; ++y) { + ReverseRow(src_u, dst_u, halfwidth); + src_u += src_stride_u; + dst_u += dst_stride_u; + } + // V Plane + for (y = 0; y < halfheight; ++y) { + ReverseRow(src_v, dst_v, halfwidth); + src_v += src_stride_v; + dst_v += dst_stride_v; } return 0; } @@ -417,8 +493,6 @@ void HalfRow_C(const uint8* src_uv, int src_uv_stride, } } -// Helper function to copy yuv data without scaling. Used -// by our jpeg conversion callbacks to incrementally fill a yuv image. int I422ToI420(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -454,7 +528,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y, } // Copy Y plane - I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); // SubSample U plane. int y; @@ -479,7 +553,133 @@ int I422ToI420(const uint8* src_y, int src_stride_y, return 0; } -static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Copy Y plane + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + + int halfwidth = (width + 1) >> 1; + // UpSample U plane. + int y; + for (y = 0; y < height - 1; y += 2) { + memcpy(dst_u, src_u, halfwidth); + memcpy(dst_u + dst_stride_u, src_u, halfwidth); + src_u += src_stride_u; + dst_u += dst_stride_u * 2; + } + if (height & 1) { + memcpy(dst_u, src_u, halfwidth); + } + + // UpSample V plane. + for (y = 0; y < height - 1; y += 2) { + memcpy(dst_v, src_v, halfwidth); + memcpy(dst_v + dst_stride_v, src_v, halfwidth); + src_v += src_stride_v; + dst_v += dst_stride_v * 2; + } + if (height & 1) { + memcpy(dst_v, src_v, halfwidth); + } + return 0; +} + +// Blends 32x2 pixels to 16x1 +// source in scale.cc +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_SCALEROWDOWN2_NEON +void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width); +#elif (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(YUV_DISABLE_ASM) +void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); +#endif +void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + +// Half Width and Height +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + int halfwidth = (width + 1) >> 1; + void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (halfwidth % 16 == 0)) { + ScaleRowDown2 = ScaleRowDown2Int_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && (halfwidth % 16 == 0) && + IS_ALIGNED(src_u, 16) && (src_stride_u % 16 == 0) && + IS_ALIGNED(src_v, 16) && (src_stride_v % 16 == 0) && + IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && + IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) { + ScaleRowDown2 = ScaleRowDown2Int_SSE2; + } else +#endif + { + ScaleRowDown2 = ScaleRowDown2Int_C; + } + + // Copy Y plane + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + + // SubSample U plane. + int y; + for (y = 0; y < height - 1; y += 2) { + ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth); + src_u += src_stride_u * 2; + dst_u += dst_stride_u; + } + if (height & 1) { + ScaleRowDown2(src_u, 0, dst_u, halfwidth); + } + + // SubSample V plane.
+ for (y = 0; y < height - 1; y += 2) { + ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth); + src_v += src_stride_v * 2; + dst_v += dst_stride_v; + } + if (height & 1) { + ScaleRowDown2(src_v, 0, dst_v, halfwidth); + } + return 0; +} + +static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, uint8* dst, int dst_stride_frame, int width, int height) { // Copy plane @@ -544,7 +744,7 @@ static int X420ToI420(const uint8* src_y, SplitUV = SplitUV_C; } - I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, width, height); int halfheight = (height + 1) >> 1; @@ -1108,18 +1308,18 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, YUY2ToI420RowY = YUY2ToI420RowY_C; YUY2ToI420RowUV = YUY2ToI420RowUV_C; } - for (int y = 0; y < height; ++y) { - if ((y & 1) == 0) { - if (y >= (height - 1) ) { // last chroma on odd height clamp height - src_stride_yuy2 = 0; - } - YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } + for (int y = 0; y < height - 1; y += 2) { + YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + YUY2ToI420RowY(src_yuy2, dst_y, width); + YUY2ToI420RowY(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + dst_y += dst_stride_y * 2; + src_yuy2 += src_stride_yuy2 * 2; + } + if (height & 1) { + YUY2ToI420RowUV(src_yuy2, 0, dst_u, dst_v, width); YUY2ToI420RowY(src_yuy2, dst_y, width); - dst_y += dst_stride_y; - src_yuy2 += src_stride_yuy2; } return 0; } @@ -1155,18 +1355,18 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, UYVYToI420RowY = UYVYToI420RowY_C; UYVYToI420RowUV = UYVYToI420RowUV_C; } - for (int y = 0; y < height; ++y) { - if ((y & 1) == 0) { - if (y >= (height - 1) ) { // last chroma on odd height clamp height - src_stride_uyvy = 0; - } - UYVYToI420RowUV(src_uyvy, src_stride_uyvy, 
dst_u, dst_v, width); - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } + for (int y = 0; y < height - 1; y += 2) { + UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + UYVYToI420RowY(src_uyvy, dst_y, width); + UYVYToI420RowY(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + dst_y += dst_stride_y * 2; + src_uyvy += src_stride_uyvy * 2; + } + if (height & 1) { + UYVYToI420RowUV(src_uyvy, 0, dst_u, dst_v, width); UYVYToI420RowY(src_uyvy, dst_y, width); - dst_y += dst_stride_y; - src_uyvy += src_stride_uyvy; } return 0; } diff --git a/source/rotate.cc b/source/rotate.cc index ad9434b2c..d8faf23c2 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -20,10 +20,6 @@ namespace libyuv { #if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ !defined(YUV_DISABLE_ASM) // Note static const preferred, but gives internal compiler error on gcc 4.2 -// Shuffle table for reversing the bytes. -uvec8 kShuffleReverse = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; // Shuffle table for reversing the bytes of UV channels. 
uvec8 kShuffleReverseUV = { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u @@ -31,7 +27,6 @@ uvec8 kShuffleReverseUV = { #endif typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int); -typedef void (*reverse_func)(const uint8*, uint8*, int); typedef void (*rotate_uv_wx8_func)(const uint8*, int, uint8*, int, uint8*, int, int); @@ -844,71 +839,10 @@ void RotatePlane270(const uint8* src, int src_stride, TransposePlane(src, src_stride, dst, dst_stride, width, height); } -static void ReverseRow_C(const uint8* src, uint8* dst, int width) { - int i; - src += width - 1; - for (i = 0; i < width; ++i) { - dst[i] = src[0]; - --src; - } -} - -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) -#define HAS_REVERSE_ROW_SSSE3 -__declspec(naked) -static void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) { -__asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - movdqa xmm5, kShuffleReverse - lea eax, [eax + ecx - 16] - convertloop: - movdqa xmm0, [eax] - lea eax, [eax - 16] - pshufb xmm0, xmm5 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - ja convertloop - ret - } -} - -#elif (defined(__i386__) || defined(__x86_64__)) && \ - !defined(YUV_DISABLE_ASM) -#define HAS_REVERSE_ROW_SSSE3 -static void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) { - intptr_t temp_width = static_cast<intptr_t>(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" - "lea -0x10(%0,%2,1),%0 \n" -"1: \n" - "movdqa (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleReverse) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm5" -#endif - ); -} -#endif - void RotatePlane180(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { - int i; - reverse_func ReverseRow; - + void (*ReverseRow)(const uint8*
src, uint8* dst, int width); #if defined(HAS_REVERSE_ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ReverseRow = ReverseRow_NEON; @@ -925,10 +859,11 @@ void RotatePlane180(const uint8* src, int src_stride, { ReverseRow = ReverseRow_C; } + // Rotate by 180 is a mirror and vertical flip src += src_stride * (height - 1); - for (i = 0; i < height; ++i) { + for (int y = 0; y < height; ++y) { ReverseRow(src, dst, width); src -= src_stride; dst += dst_stride; diff --git a/source/row.h b/source/row.h index ff7057615..28fb3647b 100644 --- a/source/row.h +++ b/source/row.h @@ -63,6 +63,12 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf, #define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3 #define HAS_FASTCONVERTYUVTOABGRROW_SSSE3 #define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3 +#define HAS_REVERSE_ROW_SSSE3 +#endif + +// The following are available on Neon platforms +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_REVERSE_ROW_NEON #endif extern "C" { @@ -89,6 +95,14 @@ void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); #endif +#ifdef HAS_REVERSE_ROW_SSSE3 +void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width); +#endif +#ifdef HAS_REVERSE_ROW_NEON +void ReverseRow_NEON(const uint8* src, uint8* dst, int width); +#endif +void ReverseRow_C(const uint8* src, uint8* dst, int width); + void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); diff --git a/source/row_common.cc b/source/row_common.cc index 252d1d763..f763a05ed 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -335,4 +335,12 @@ void FastConvertYToARGBRow_C(const uint8* y_buf, } } +void ReverseRow_C(const uint8* src, uint8* dst, int width) { + src += width - 1; + for (int i = 0; i < width; ++i) { + dst[i] = src[0]; + --src; + } +} 
+ } // extern "C" diff --git a/source/row_posix.cc b/source/row_posix.cc index 89b8c6430..005efbb46 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -634,4 +634,36 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, } #endif +#ifdef HAS_REVERSE_ROW_SSSE3 + +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleReverse = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = static_cast<intptr_t>(width); + asm volatile ( + "movdqa %3,%%xmm5 \n" + "lea -0x10(%0,%2,1),%0 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleReverse) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} +#endif + } // extern "C" diff --git a/source/row_win.cc b/source/row_win.cc index 050c01afc..f150d0e4a 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -814,7 +814,34 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf, } } #endif +#endif +#ifdef HAS_REVERSE_ROW_SSSE3 + +// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleReverse = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) +void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) { +__asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + movdqa xmm5, kShuffleReverse + lea eax, [eax + ecx - 16] + convertloop: + movdqa xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm5 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} #endif } // extern "C" diff --git a/source/scale.cc b/source/scale.cc index b6b8b432a..2f031685e 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -42,6 +42,8 @@ void SetUseReferenceImpl(bool use) { use_reference_impl_ = use; } +// ScaleRowDown2Int also used by planar functions + /** * NEON downscalers with interpolation. * @@ -624,8 +626,8 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
__declspec(naked) -static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr @@ -2778,8 +2780,8 @@ static void ScaleRowDown2_C(const uint8* src_ptr, int, } } -static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { for (int x = 0; x < dst_width; ++x) { *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; @@ -3068,8 +3070,9 @@ static void ScalePlaneDown2(int src_width, int src_height, #endif #if defined(HAS_SCALEROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && - (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) && - IS_ALIGNED(dst_ptr, 16)) { + (dst_width % 16 == 0) && + IS_ALIGNED(src_ptr, 16) && (src_stride % 16 == 0) && + IS_ALIGNED(dst_ptr, 16) && (dst_stride % 16 == 0)) { ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; } else #endif diff --git a/source/video_common.h b/source/video_common.h index 8d7d13287..839050de8 100644 --- a/source/video_common.h +++ b/source/video_common.h @@ -38,7 +38,9 @@ namespace libyuv { enum FourCC { // Canonical fourcc codes used in our code. FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_I422 = FOURCC('I', '4', '2', '2'), FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), FOURCC_M420 = FOURCC('M', '4', '2', '0'), @@ -62,6 +64,7 @@ enum FourCC { // equivalents by CanonicalFourCC(). 
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420 FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Alias for I420 + FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422 FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2 FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY