diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index c08011ef5..fa3b64463 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -16,93 +16,81 @@ namespace libyuv { -int -I420ToRGB24(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height); +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); -int -I420ToARGB4444(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height); +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); -int -I420ToRGB565(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height); +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); -int -I420ToARGB1555(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height); +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); -int -I420ToYUY2(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int 
src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height); +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); -int -I420ToUYVY(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height); +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); -int -RGB24ToARGB(const uint8* src_frame, int src_stride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height); +// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_frame, int dst_stride_frame, + int width, int height); -int -RGB24ToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height); +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -int -RAWToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height); +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -int -ABGRToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int 
dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height); +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -int -BGRAToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height); +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -int -ARGBToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height); +int ARGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -int -NV12ToRGB565(const uint8* src_yplane, int src_ystride, - const uint8* src_uvplane, int src_uvstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height); +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_frame, int dst_stride_frame, + int width, int height); } // namespace libyuv diff --git a/source/convert.cc b/source/convert.cc index 3d6dc99fe..ee7af0cca 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -10,8 +10,10 @@ #include "libyuv/convert.h" -#include "libyuv/basic_types.h" #include "conversion_tables.h" +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "row.h" //#define SCALEOPT //Currently for windows only. 
June 2010 @@ -30,29 +32,29 @@ static inline uint8 Clip(int32 val) { return (uint8) val; } -int I420ToRGB24(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height) -{ - if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL || - dst_frame == NULL) +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { return -1; + } // RGB orientation - bottom up - uint8* out = dst_frame + dst_stride * src_height - dst_stride; - uint8* out2 = out - dst_stride; + // TODO(fbarchard): support inversion + uint8* out = dst_frame + dst_stride_frame * height - dst_stride_frame; + uint8* out2 = out - dst_stride_frame; int h, w; int tmp_r, tmp_g, tmp_b; const uint8 *y1, *y2 ,*u, *v; - y1 = src_yplane; - y2 = y1 + src_ystride; - u = src_uplane; - v = src_vplane; - for (h = ((src_height + 1) >> 1); h > 0; h--){ + y1 = src_y; + y2 = y1 + src_stride_y; + u = src_u; + v = src_v; + for (h = ((height + 1) >> 1); h > 0; h--){ // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((src_width + 1) >> 1); w++){ + for (w = 0; w < ((width + 1) >> 1); w++){ // Vertical and horizontal sub-sampling tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); @@ -89,41 +91,40 @@ int I420ToRGB24(const uint8* src_yplane, int src_ystride, u++; v++; } - y1 += src_ystride + src_ystride - src_width; - y2 += src_ystride + src_ystride - src_width; - u += src_ustride - ((src_width + 1) >> 1); - v += src_vstride - ((src_width + 1) >> 1); - out -= dst_stride * 3; - out2 -= dst_stride * 3; + y1 += src_stride_y + src_stride_y - width; + y2 += src_stride_y + src_stride_y - 
width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out -= dst_stride_frame * 3; + out2 -= dst_stride_frame * 3; } // end height for return 0; } // Little Endian... -int I420ToARGB4444(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height) -{ - if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL || - dst_frame == NULL) +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { return -1; + } // RGB orientation - bottom up - uint8* out = dst_frame + dst_stride * (src_height - 1); - uint8* out2 = out - dst_stride; + uint8* out = dst_frame + dst_stride_frame * (height - 1); + uint8* out2 = out - dst_stride_frame; int tmp_r, tmp_g, tmp_b; const uint8 *y1,*y2, *u, *v; - y1 = src_yplane; - y2 = y1 + src_ystride; - u = src_uplane; - v = src_vplane; + y1 = src_y; + y2 = y1 + src_stride_y; + u = src_u; + v = src_v; int h, w; - for (h = ((src_height + 1) >> 1); h > 0; h--){ + for (h = ((height + 1) >> 1); h > 0; h--) { // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((src_width + 1) >> 1); w++){ + for (w = 0; w < ((width + 1) >> 1); w++) { // Vertical and horizontal sub-sampling // Convert to RGB888 and re-scale to 4 bits tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); @@ -157,51 +158,50 @@ int I420ToARGB4444(const uint8* src_yplane, int src_ystride, u++; v++; } - y1 += 2 * src_ystride - src_width; - y2 += 2 * src_ystride - src_width; - u += src_ustride - ((src_width + 1) >> 1); - v += src_vstride - ((src_width + 1) >> 1); - out -= (dst_stride + src_width) * 2; - out2 -= (dst_stride + src_width) * 2; + y1 += 2 * src_stride_y - width; + y2 
+= 2 * src_stride_y - width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out -= (dst_stride_frame + width) * 2; + out2 -= (dst_stride_frame + width) * 2; } // end height for return 0; } -int I420ToRGB565(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height) -{ - if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL || - dst_frame == NULL) +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { return -1; + } // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src_yplane = src_yplane + (src_height - 1) * src_ystride; - src_uplane = src_uplane + (src_height - 1) * src_ustride; - src_vplane = src_vplane + (src_height - 1) * src_vstride; - src_ystride = -src_ystride; - src_ustride = -src_ustride; - src_vstride = -src_vstride; + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; } - uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1); - uint16* out2 = out - dst_stride; + uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1); + uint16* out2 = out - dst_stride_frame; int tmp_r, tmp_g, tmp_b; - const uint8 *y1,*y2, *u, *v; - y1 = src_yplane; - y2 = y1 + src_ystride; - u = src_uplane; - v = src_vplane; + const uint8* y1,* y2, * u, * v; + y1 = src_y; + y2 = y1 + src_stride_y; + u = src_u; + v = src_v; int h, w; - for (h = ((src_height + 1) >> 1); h > 0; h--){ + 
for (h = ((height + 1) >> 1); h > 0; h--){ // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((src_width + 1) >> 1); w++){ + for (w = 0; w < ((width + 1) >> 1); w++){ // Vertical and horizontal sub-sampling // 1. Convert to RGB888 // 2. Shift to adequate location (in the 16 bit word) - RGB 565 @@ -237,41 +237,39 @@ int I420ToRGB565(const uint8* src_yplane, int src_ystride, u++; v++; } - y1 += 2 * src_ystride - src_width; - y2 += 2 * src_ystride - src_width; - u += src_ustride - ((src_width + 1) >> 1); - v += src_vstride - ((src_width + 1) >> 1); - out -= 2 * dst_stride + src_width; - out2 -= 2 * dst_stride + src_width; + y1 += 2 * src_stride_y - width; + y2 += 2 * src_stride_y - width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out -= 2 * dst_stride_frame + width; + out2 -= 2 * dst_stride_frame + width; } return 0; } -int I420ToARGB1555(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height) -{ - if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL || - dst_frame == NULL){ - return -1; +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { + return -1; } - uint16* out = (uint16*)(dst_frame) + dst_stride * (src_height - 1); - uint16* out2 = out - dst_stride ; + uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1); + uint16* out2 = out - dst_stride_frame ; int32 tmp_r, tmp_g, tmp_b; const uint8 *y1,*y2, *u, *v; int h, w; - y1 = src_yplane; - y2 = y1 + src_ystride; - u = src_uplane; - v = src_vplane; + y1 = src_y; + y2 = y1 + src_stride_y; + u = src_u; + v = src_v; - for (h = ((src_height + 1) >> 1); h > 0; h--){ + for (h = 
((height + 1) >> 1); h > 0; h--){ // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((src_width + 1) >> 1); w++){ + for (w = 0; w < ((width + 1) >> 1); w++){ // Vertical and horizontal sub-sampling // 1. Convert to RGB888 // 2. Shift to adequate location (in the 16 bit word) - RGB 555 @@ -307,41 +305,37 @@ int I420ToARGB1555(const uint8* src_yplane, int src_ystride, u++; v++; } - y1 += 2 * src_ystride - src_width; - y2 += 2 * src_ystride - src_width; - u += src_ustride - ((src_width + 1) >> 1); - v += src_vstride - ((src_width + 1) >> 1); - out -= 2 * dst_stride + src_width; - out2 -= 2 * dst_stride + src_width; + y1 += 2 * src_stride_y - width; + y2 += 2 * src_stride_y - width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out -= 2 * dst_stride_frame + width; + out2 -= 2 * dst_stride_frame + width; } return 0; } -int I420ToYUY2(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height) -{ - if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL || - dst_frame == NULL){ +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { return -1; } - const uint8* in1 = src_yplane; - const uint8* in2 = src_yplane + src_ystride ; - const uint8* src_u = src_uplane; - const uint8* src_v = src_vplane; + const uint8* in1 = src_y; + const uint8* in2 = src_y + src_stride_y; uint8* out1 = dst_frame; - uint8* out2 = dst_frame + dst_stride; + uint8* out2 = dst_frame + dst_stride_frame; // YUY2 - Macro-pixel = 2 image pixels // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
#ifndef SCALEOPT - for (int i = 0; i < ((src_height + 1) >> 1); i++){ - for (int j = 0; j < ((src_width + 1) >> 1); j++){ + for (int i = 0; i < ((height + 1) >> 1); i++){ + for (int j = 0; j < ((width + 1) >> 1); j++){ out1[0] = in1[0]; out1[1] = *src_u; out1[2] = in1[1]; @@ -358,16 +352,15 @@ int I420ToYUY2(const uint8* src_yplane, int src_ystride, in1 += 2; in2 += 2; } - in1 += 2 * src_ystride - src_width; - in2 += 2 * src_ystride - src_width; - src_u += src_ustride - ((src_width + 1) >> 1); - src_v += src_vstride - ((src_width + 1) >> 1); - out1 += dst_stride + dst_stride - 2 * src_width; - out2 += dst_stride + dst_stride - 2 * src_width; + in1 += 2 * src_stride_y - width; + in2 += 2 * src_stride_y - width; + src_u += src_stride_u - ((width + 1) >> 1); + src_v += src_stride_v - ((width + 1) >> 1); + out1 += dst_stride_frame + dst_stride_frame - 2 * width; + out2 += dst_stride_frame + dst_stride_frame - 2 * width; } #else - for (WebRtc_UWord32 i = 0; i < ((height + 1) >> 1);i++) - { + for (WebRtc_UWord32 i = 0; i < ((height + 1) >> 1);i++) { int32 width__ = (width >> 4); _asm { @@ -424,40 +417,39 @@ int I420ToYUY2(const uint8* src_yplane, int src_ystride, ;popa emms } - in1 += 2 * src_ystride - src_width; - in2 += 2 * src_ystride - src_width; - out1 += dst_stride + dst_stride - 2 * width; - out2 += dst_stride + dst_stride - 2 * width; + in1 += 2 * src_stride_y - width; + in2 += 2 * src_stride_y - width; + out1 += dst_stride_frame + dst_stride_frame - 2 * width; + out2 += dst_stride_frame + dst_stride_frame - 2 * width; } #endif return 0; } -int I420ToUYVY(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height) -{ - if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL || - dst_frame == NULL) +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int 
src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { return -1; + } int i = 0; - const uint8* y1 = src_yplane; - const uint8* y2 = y1 + src_ystride; - const uint8* u = src_uplane; - const uint8* v = src_vplane; + const uint8* y1 = src_y; + const uint8* y2 = y1 + src_stride_y; + const uint8* u = src_u; + const uint8* v = src_v; uint8* out1 = dst_frame; - uint8* out2 = dst_frame + dst_stride; + uint8* out2 = dst_frame + dst_stride_frame; // Macro-pixel = 2 image pixels // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5..... #ifndef SCALEOPT - for (; i < ((src_height + 1) >> 1);i++){ - for (int j = 0; j < ((src_width + 1) >> 1) ;j++){ + for (; i < ((height + 1) >> 1); i++) { + for (int j = 0; j < ((width + 1) >> 1); j++) { out1[0] = *u; out1[1] = y1[0]; out1[2] = *v; @@ -474,16 +466,15 @@ int I420ToUYVY(const uint8* src_yplane, int src_ystride, y1 += 2; y2 += 2; } - y1 += 2 * src_ystride - src_width; - y2 += 2 * src_ystride - src_width; - u += src_ustride - ((src_width + 1) >> 1); - v += src_vstride - ((src_width + 1) >> 1); - out1 += 2 * (dst_stride - src_width); - out2 += 2 * (dst_stride - src_width); + y1 += 2 * src_stride_y - width; + y2 += 2 * src_stride_y - width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out1 += 2 * (dst_stride_frame - width); + out2 += 2 * (dst_stride_frame - width); } #else - for (; i < (height >> 1);i++) - { + for (; i < (height >> 1);i++) { int32 width__ = (width >> 4); _asm { @@ -540,35 +531,35 @@ loop0: } in1 += width; in2 += width; - out1 += 2 * (dst_stride - width); - out2 += 2 * (dst_stride - width); + out1 += 2 * (dst_stride_frame - width); + out2 += 2 * (dst_stride_frame - width); } #endif return 0; } -int NV12ToRGB565(const uint8* src_yplane, int src_ystride, - const uint8* src_uvplane, int src_uvstride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height) -{ - if (src_yplane == NULL 
|| src_uvplane == NULL || dst_frame == NULL) - return -1; +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_uv == NULL || dst_frame == NULL) { + return -1; + } // Bi-Planar: Y plane followed by an interlaced U and V plane - const uint8* interlacedSrc = src_uvplane; - uint16* out = (uint16*)(src_yplane) + dst_stride * (src_height - 1); - uint16* out2 = out - dst_stride; + const uint8* interlacedSrc = src_uv; + uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1); + uint16* out2 = out - dst_stride_frame; int32 tmp_r, tmp_g, tmp_b; const uint8 *y1,*y2; - y1 = src_yplane; - y2 = y1 + src_ystride; + y1 = src_y; + y2 = y1 + src_stride_y; int h, w; - for (h = ((src_height + 1) >> 1); h > 0; h--){ + for (h = ((height + 1) >> 1); h > 0; h--) { // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((src_width + 1) >> 1); w++){ + for (w = 0; w < ((width + 1) >> 1); w++) { // Vertical and horizontal sub-sampling // 1. Convert to RGB888 // 2. 
Shift to adequate location (in the 16 bit word) - RGB 565 @@ -608,29 +599,30 @@ int NV12ToRGB565(const uint8* src_yplane, int src_ystride, out2 += 2; interlacedSrc += 2; } - y1 += 2 * src_ystride - src_width; - y2 += 2 * src_ystride - src_width; - interlacedSrc += src_uvstride - ((src_width + 1) >> 1); - out -= 3 * dst_stride + dst_stride - src_width; - out2 -= 3 * dst_stride + dst_stride - src_width; + y1 += 2 * src_stride_y - width; + y2 += 2 * src_stride_y - width; + interlacedSrc += src_stride_uv - ((width + 1) >> 1); + out -= 3 * dst_stride_frame + dst_stride_frame - width; + out2 -= 3 * dst_stride_frame + dst_stride_frame - width; } return 0; } -int RGB24ToARGB(const uint8* src_frame, int src_stride, - uint8* dst_frame, int dst_stride, - int src_width, int src_height) -{ - if (src_frame == NULL || dst_frame == NULL) +// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_frame == NULL || dst_frame == NULL) { return -1; + } int i, j, offset; uint8* outFrame = dst_frame; const uint8* inFrame = src_frame; - outFrame += dst_stride * (src_height - 1) * 4; - for (i = 0; i < src_height; i++){ - for (j = 0; j < src_width; j++){ + outFrame += dst_stride_frame * (height - 1) * 4; + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { offset = j * 4; outFrame[0 + offset] = inFrame[0]; outFrame[1 + offset] = inFrame[1]; @@ -638,8 +630,8 @@ int RGB24ToARGB(const uint8* src_frame, int src_stride, outFrame[3 + offset] = 0xff; inFrame += 3; } - outFrame -= 4 * (dst_stride - src_width); - inFrame += src_stride - src_width; + outFrame -= 4 * (dst_stride_frame - width); + inFrame += src_stride_frame - width; } return 0; } @@ -654,10 +646,10 @@ int RGB24ToARGB(const uint8* src_frame, int src_stride, static void \ NAME(const uint8* src_row0, const uint8* src_row1, \ uint8* dst_yplane0, uint8* dst_yplane1, \ - 
uint8* dst_uplane, \ - uint8* dst_vplane, \ - int src_width) { \ - for (int x = 0; x < src_width - 1; x += 2) { \ + uint8* dst_u, \ + uint8* dst_v, \ + int width) { \ + for (int x = 0; x < width - 1; x += 2) { \ dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \ src_row0[G] * 129 + \ src_row0[B] * 25 + 128) >> 8) + 16; \ @@ -670,14 +662,14 @@ NAME(const uint8* src_row0, const uint8* src_row1, \ dst_yplane1[1] = (uint8)((src_row1[R + BPP] * 66 + \ src_row1[G + BPP] * 129 + \ src_row1[B + BPP] * 25 + 128) >> 8) + 16; \ - dst_uplane[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \ + dst_u[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \ src_row1[R] + src_row1[R + BPP]) * -38 + \ (src_row0[G] + src_row0[G + BPP] + \ src_row1[G] + src_row1[G + BPP]) * -74 + \ (src_row0[B] + src_row0[B + BPP] + \ src_row1[B] + src_row1[B + BPP]) * 112 + \ + 512) >> 10) + 128; \ - dst_vplane[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \ + dst_v[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \ src_row1[R] + src_row1[R + BPP]) * 112 + \ (src_row0[G] + src_row0[G + BPP] + \ src_row1[G] + src_row1[G + BPP]) * -94 + \ @@ -686,26 +678,26 @@ NAME(const uint8* src_row0, const uint8* src_row1, \ + 512) >> 10) + 128; \ dst_yplane0 += 2; \ dst_yplane1 += 2; \ - ++dst_uplane; \ - ++dst_vplane; \ + ++dst_u; \ + ++dst_v; \ src_row0 += BPP * 2; \ src_row1 += BPP * 2; \ } \ - if (src_width & 1) { \ + if (width & 1) { \ dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \ src_row0[G] * 129 + \ src_row0[B] * 25 + 128) >> 8) + 16; \ dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \ src_row1[G] * 129 + \ src_row1[B] * 25 + 128) >> 8) + 16; \ - dst_uplane[0] = (uint8)(((src_row0[R] + \ + dst_u[0] = (uint8)(((src_row0[R] + \ src_row1[R]) * -38 + \ (src_row0[G] + \ src_row1[G]) * -74 + \ (src_row0[B] + \ src_row1[B]) * 112 + \ + 256) >> 9) + 128; \ - dst_vplane[0] = (uint8)(((src_row0[R] + \ + dst_v[0] = (uint8)(((src_row0[R] + \ src_row1[R]) * 112 + \ (src_row0[G] + \ src_row1[G]) * -94 + \ @@ -723,104 +715,157 
@@ MAKEROWRGBTOI420(ABGRToI420Row_C, 0, 1, 2, 4) MAKEROWRGBTOI420(RGB24ToI420Row_C, 2, 1, 0, 3) MAKEROWRGBTOI420(RAWToI420Row_C, 0, 1, 2, 3) -static int RGBToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height, +static int RGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, void (*RGBToI420Row)(const uint8* src_row0, const uint8* src_row1, uint8* dst_yplane0, uint8* dst_yplane1, - uint8* dst_uplane, - uint8* dst_vplane, - int src_width)) { - if (src_frame == NULL || dst_yplane == NULL || - dst_vplane == NULL || dst_vplane == NULL) + uint8* dst_u, + uint8* dst_v, + int width)) { + if (src_frame == NULL || dst_y == NULL || + dst_u == NULL || dst_v == NULL) return -1; - if (src_height < 0) { - src_height = -src_height; - src_frame = src_frame + src_stride * (src_height -1); - src_stride = -src_stride; + if (height < 0) { + height = -height; + src_frame = src_frame + src_stride_frame * (height -1); + src_stride_frame = -src_stride_frame; } - for (int y = 0; y < src_height - 1; y += 2) { - RGBToI420Row(src_frame, src_frame + src_stride, - dst_yplane, dst_yplane + dst_ystride, - dst_uplane, dst_vplane, - src_width); - src_frame += src_stride * 2; - dst_yplane += dst_ystride * 2; - dst_uplane += dst_ustride; - dst_vplane += dst_vstride; + for (int y = 0; y < height - 1; y += 2) { + RGBToI420Row(src_frame, src_frame + src_stride_frame, + dst_y, dst_y + dst_stride_y, + dst_u, dst_v, + width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; } - if (src_height & 1) { + if (height & 1) { RGBToI420Row(src_frame, src_frame, - dst_yplane, dst_yplane, - dst_uplane, dst_vplane, - src_width); + dst_y, dst_y, + dst_u, dst_v, + width); } return 0; } 
-int ARGBToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height) { - return RGBToI420(src_frame, src_stride, - dst_yplane, dst_ystride, - dst_uplane, dst_ustride, - dst_vplane, dst_vstride, - src_width, src_height, ARGBToI420Row_C); +int ARGBToI420_Reference(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return RGBToI420(src_frame, src_stride_frame, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, ARGBToI420Row_C); } -int BGRAToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height) { - return RGBToI420(src_frame, src_stride, - dst_yplane, dst_ystride, - dst_uplane, dst_ustride, - dst_vplane, dst_vstride, - src_width, src_height, BGRAToI420Row_C); +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return RGBToI420(src_frame, src_stride_frame, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, BGRAToI420Row_C); } -int ABGRToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height) { - return RGBToI420(src_frame, src_stride, - dst_yplane, dst_ystride, - dst_uplane, dst_ustride, - dst_vplane, dst_vstride, - src_width, src_height, ABGRToI420Row_C); +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int 
height) { + return RGBToI420(src_frame, src_stride_frame, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, ABGRToI420Row_C); } -int RGB24ToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height) { - return RGBToI420(src_frame, src_stride, - dst_yplane, dst_ystride, - dst_uplane, dst_ustride, - dst_vplane, dst_vstride, - src_width, src_height, RGB24ToI420Row_C); +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return RGBToI420(src_frame, src_stride_frame, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, RGB24ToI420Row_C); } -int RAWToI420(const uint8* src_frame, int src_stride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int src_width, int src_height) { - return RGBToI420(src_frame, src_stride, - dst_yplane, dst_ystride, - dst_uplane, dst_ustride, - dst_vplane, dst_vstride, - src_width, src_height, RAWToI420Row_C); +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return RGBToI420(src_frame, src_stride_frame, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, RAWToI420Row_C); +} + +int ARGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int 
pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_ARGBTOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 8 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = ARGBToYRow_C; + } +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 8 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_u, 4) && (dst_stride_u % 4 == 0) && + IS_ALIGNED(dst_v, 4) && (dst_stride_v % 4 == 0)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = ARGBToUVRow_C; + } + + for (int y = 0; y < (height - 1); y += 2) { + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToYRow(src_frame, dst_y, width); + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + } + return 0; } } // namespace libyuv diff --git a/source/format_conversion.cc b/source/format_conversion.cc index 72088dbc0..db106bd4d 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -12,21 +12,10 @@ #include "libyuv/cpu_id.h" #include "video_common.h" +#include "row.h" namespace libyuv { -// Most code in here is inspired by the material at -// http://www.siliconimaging.com/RGB%20Bayer.htm - -// Forces compiler to inline, even against its better judgement. Use wisely. 
-#if defined(__GNUC__) -#define FORCE_INLINE __attribute__((always_inline)) -#elif defined(WIN32) -#define FORCE_INLINE __forceinline -#else -#define FORCE_INLINE -#endif - // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers // and vst would select which 2 components to write. The low level would need // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR @@ -333,46 +322,6 @@ int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer, return 0; } -// Taken from http://en.wikipedia.org/wiki/YUV -static FORCE_INLINE int RGBToY(uint8 r, uint8 g, uint8 b) { - return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; -} - -static FORCE_INLINE int RGBToU(uint8 r, uint8 g, uint8 b) { - return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; -} -static FORCE_INLINE int RGBToV(uint8 r, uint8 g, uint8 b) { - return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; -} - -static void ARGBtoYRow(const uint8* src_argb0, - uint8* dst_y, int width) { - for (int x = 0; x < width; ++x) { - dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]); - src_argb0 += 4; - dst_y += 1; - } -} - -static void ARGBtoUVRow(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, - int width) { - const uint8* src_argb1 = src_argb0 + src_stride_argb; - for (int x = 0; x < width - 1; x += 2) { - uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2; - uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2; - uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb0 += 8; - src_argb1 += 8; - dst_u += 1; - dst_v += 1; - } -} - - // Converts any Bayer RGB format to ARGB. 
int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, uint32 src_fourcc_bayer, @@ -395,6 +344,28 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, uint8* dst_rgb, int pix); void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, uint8* dst_rgb, int pix); + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#define kMaxStride (2048 * 4) + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); +#if defined(HAS_ARGBTOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 8 == 0) && + IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) && + IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) { + ARGBToYRow = ARGBToYRow_SSSE3; +#if defined(HAS_ARGBTOUVROW_SSSE3) + ARGBToUVRow = ARGBToUVRow_SSSE3; +#else + ARGBToUVRow = ARGBToUVRow_C; +#endif + } else +#endif + { + ARGBToYRow = ARGBToYRow_C; + ARGBToUVRow = ARGBToUVRow_C; + } switch (src_fourcc_bayer) { default: @@ -417,24 +388,23 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, break; } -#define kMaxStride 2048 * 4 - uint8 row[kMaxStride * 2]; for (int y = 0; y < (height - 1); y += 2) { BayerRow0(src_bayer, src_stride_bayer, row, width); BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, row + kMaxStride, width); - ARGBtoYRow(row, dst_y, width); - ARGBtoYRow(row + kMaxStride, dst_y + dst_stride_y, width); - ARGBtoUVRow(row, kMaxStride, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width); + ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); src_bayer += src_stride_bayer * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } + // TODO(fbarchard): Make sure this filters properly if (height & 1) { BayerRow0(src_bayer, src_stride_bayer, row, width); - ARGBtoYRow(row, dst_y, width); - ARGBtoUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); 
+ ARGBToUVRow(row, 0, dst_u, dst_v, width); } return 0; } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 4ee19db8a..b7984c086 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -68,15 +68,6 @@ extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; -// Constant multiplication table for converting ARGB to I400. -extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = { - 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u -}; - -extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) = { - 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u -}; - #if defined(WIN32) && !defined(COVERAGE_ENABLED) #define HAS_SPLITUV_SSE2 __declspec(naked) @@ -215,7 +206,7 @@ int I420Copy(const uint8* src_y, int src_stride_y, static void SetRow32_NEON(uint8* dst, uint32 v32, int count) { __asm__ volatile ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints + "vdup.u32 {q0}, %2 \n" // duplicate 4 ints "1:\n" "vst1.u32 {q0}, [%0]! \n" // store "subs %1, %1, #16 \n" // 16 processed per loop @@ -393,16 +384,16 @@ int I422ToI420(const uint8* src_y, int src_stride_y, } static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, - uint8* dst, int dst_stride, + uint8* dst, int dst_stride_frame, int width, int height) { // Copy plane for (int y = 0; y < height; y += 2) { memcpy(dst, src, width); src += src_stride_0; - dst += dst_stride; + dst += dst_stride_frame; memcpy(dst, src, width); src += src_stride_1; - dst += dst_stride; + dst += dst_stride_frame; } } @@ -503,13 +494,13 @@ int NV12ToI420(const uint8* src_y, int src_stride_y, // Convert NV12 to I420. Deprecated. 
int NV12ToI420(const uint8* src_y, const uint8* src_uv, - int src_stride, + int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - return X420ToI420(src_y, src_stride, src_stride, - src_uv, src_stride, + return X420ToI420(src_y, src_stride_frame, src_stride_frame, + src_uv, src_stride_frame, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, @@ -1371,38 +1362,6 @@ __asm { } } -#define HAS_ARGBTOI400ROW_SSSE3 -__declspec(naked) -static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { -__asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - movdqa xmm7, _kMultiplyMaskARGBToI400 - movdqa xmm6, _kMultiplyMaskARGBToI400_2 - movdqa xmm5, xmm6 - psllw xmm5, 4 // Generate a mask of 0x10 on each byte. - - convertloop : - movdqa xmm0, [eax] - pmaddubsw xmm0, xmm7 - movdqa xmm1, [eax + 16] - psrlw xmm0, 7 - pmaddubsw xmm1, xmm7 - lea eax, [eax + 32] - psrlw xmm1, 7 - packuswb xmm0, xmm1 - pmaddubsw xmm0, xmm6 - packuswb xmm0, xmm0 - paddb xmm0, xmm5 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - ja convertloop - ret - } -} - #elif (defined(__x86_64__) || defined(__i386__)) && \ !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) @@ -1554,39 +1513,6 @@ static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, ); } -#define HAS_ARGBTOI400ROW_SSSE3 -static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y, - int pix) { - asm volatile( - "movdqa (%3),%%xmm7\n" - "movdqa (%4),%%xmm6\n" - "movdqa %%xmm6,%%xmm5\n" - "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte. 
-"1:" - "movdqa (%0),%%xmm0\n" - "pmaddubsw %%xmm7,%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "psrlw $0x7,%%xmm0\n" - "pmaddubsw %%xmm7,%%xmm1\n" - "lea 0x20(%0),%0\n" - "psrlw $0x7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "pmaddubsw %%xmm6,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "paddb %%xmm5,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "r"(kMultiplyMaskARGBToI400), // %3 - "r"(kMultiplyMaskARGBToI400_2) // %4 - : "memory" -); -} - #endif static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) { @@ -1812,16 +1738,6 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, return 0; } -static void ARGBToI400Row_C(const uint8* src_argb, uint8* dst_y, int pix) { - for (int x = 0; x < pix; ++x) { - uint32 b = static_cast(src_argb[0] * 13u); - uint32 g = static_cast(src_argb[1] * 64u); - uint32 r = static_cast(src_argb[2] * 33u); - *(dst_y++) = static_cast(((b + g + r) >> 7) + 16u); - src_argb += 4; - } -} - // Convert ARGB to I400. 
int ARGBToI400(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, @@ -1831,21 +1747,21 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -void (*ARGBToI400Row)(const uint8* src_argb, uint8* dst_y, int pix); -#if defined(HAS_ARGBTOI400ROW_SSSE3) +void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); +#if defined(HAS_ARGBTOYROW_SSSE3) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && (width % 4 == 0) && IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { - ARGBToI400Row = ARGBToI400Row_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; } else #endif { - ARGBToI400Row = ARGBToI400Row_C; + ARGBToYRow = ARGBToYRow_C; } for (int y = 0; y < height; ++y) { - ARGBToI400Row(src_argb, dst_y, width); + ARGBToYRow(src_argb, dst_y, width); src_argb += src_stride_argb; dst_y += dst_stride_y; } diff --git a/source/row.h b/source/row.h index a11d80251..1563e95e3 100644 --- a/source/row.h +++ b/source/row.h @@ -13,6 +13,16 @@ #include "libyuv/basic_types.h" +#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ + && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_ARGBTOYROW_SSSE3 +#endif + +#if defined(WIN32) \ + && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_ARGBTOUVROW_SSSE3 +#endif + extern "C" { void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, @@ -42,11 +52,24 @@ void FastConvertYToRGB32Row(const uint8* y_buf, uint8* rgb_buf, int width); +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, 
uint8* dst_v, int width); + + #if defined(_MSC_VER) #define SIMD_ALIGNED(var) __declspec(align(16)) var +#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var #else #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) #endif + #ifdef OSX extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); diff --git a/source/row_posix.cc b/source/row_posix.cc index 02ddc1209..40e636cc2 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -12,6 +12,91 @@ extern "C" { +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constant multiplication table for converting ARGB to I400. +extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = { + 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u +}; + +extern "C" TALIGN16(const uint8, kAdd16[16]) = { + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u +}; + +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile( + "movdqa (%3),%%xmm7\n" + "movdqa (%4),%%xmm6\n" + "movdqa %%xmm6,%%xmm5\n" + "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte. 
+"1:" + "movdqa (%0),%%xmm0\n" + "pmaddubsw %%xmm7,%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "psrlw $0x7,%%xmm0\n" + "pmaddubsw %%xmm7,%%xmm1\n" + "lea 0x20(%0),%0\n" + "psrlw $0x7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "pmaddubsw %%xmm6,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "paddb %%xmm5,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "lea 0x8(%1),%1\n" + "sub $0x8,%2\n" + "ja 1b\n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "r"(kMultiplyMaskARGBToI400), // %3 + "r"(kAdd16) // %4 + : "memory" +); +} +#endif + +static inline int RGBToY(uint8 r, uint8 g, uint8 b) { + return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; +} + +static inline int RGBToU(uint8 r, uint8 g, uint8 b) { + return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; +} +static inline int RGBToV(uint8 r, uint8 g, uint8 b) { + return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; +} + +void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { + for (int x = 0; x < width; ++x) { + dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]); + src_argb0 += 4; + dst_y += 1; + } +} + +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* src_argb1 = src_argb0 + src_stride_argb; + for (int x = 0; x < width - 1; x += 2) { + uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2; + uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2; + uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb0 += 8; + src_argb1 += 8; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1; + uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1; + uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + #if defined(__x86_64__) // 64 bit linux gcc version diff --git a/source/row_win.cc 
b/source/row_win.cc index f127f4835..c90372a14 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -12,6 +12,176 @@ extern "C" { +#ifdef HAS_ARGBTOYROW_SSSE3 +#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var + +// Constant multiplication table for converting ARGB to I400. +extern "C" TALIGN16(const int8, kRGBToY[16]) = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + +extern "C" TALIGN16(const int8, kRGBToU[16]) = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +extern "C" TALIGN16(const int8, kRGBToV[16]) = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +extern "C" TALIGN16(const uint8, kAddY16[16]) = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, +}; + +extern "C" TALIGN16(const uint8, kAddUV128[16]) = { + 128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u, + 128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u +}; + +__declspec(naked) +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + movdqa xmm7, _kRGBToY + movdqa xmm6, _kAddY16 + pcmpeqb xmm5, xmm5 // Generate mask 0x0000ffff + psrld xmm5, 16 + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + pmaddubsw xmm0, xmm7 + lea eax, [eax + 32] + pmaddubsw xmm1, xmm7 // BG ra BG ra BG ra BG ra + palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx + paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx + pand xmm2, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00 + palignr xmm3, xmm1, 2 + paddw xmm3, xmm1 + pand xmm3, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00 + packssdw xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA + psrlw xmm2, 7 // 0B xx 0B xx 0B xx 0B xx + packuswb xmm2, xmm2 + paddb xmm2, xmm6 + movq qword ptr [edx], xmm2 + lea edx, [edx + 8] + sub ecx, 8 + ja convertloop + ret + } +} + +__declspec(naked) +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int 
src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, _kRGBToU + movdqa xmm6, _kRGBToV + movdqa xmm5, _kAddUV128 + pcmpeqb xmm4, xmm4 // Generate mask 0x0000ffff + psrld xmm4, 16 + + convertloop : + // step 1 - subsample 8x2 argb pixels to 4x1 + movdqa xmm0, [eax] // 32x2 -> 32x1 + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // 32x1 -> 16x1 + shufps xmm0, xmm1, 0x88 + shufps xmm2, xmm1, 0xdd + pavgb xmm0, xmm2 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 8 different pixels, its 4 pixels of U and 4 of V + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm1, xmm6 // V + + palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx + paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx + pand xmm2, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00 + + palignr xmm3, xmm1, 2 + paddw xmm3, xmm1 + pand xmm3, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00 + + psraw xmm2, 8 + psraw xmm3, 8 + packsswb xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA + paddb xmm2, xmm5 // -> unsigned + packuswb xmm2, xmm2 // 8 bytes. 
4 U, 4 V + + // step 3 - store 4 U and 4 V values + movd dword ptr [edx], xmm2 // U + lea edx, [edx + 4] + pshufd xmm0, xmm2, 0x55 // V + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 8 + ja convertloop + pop edi + pop esi + ret + } +} + +static inline int RGBToY(uint8 r, uint8 g, uint8 b) { + return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; +} + +static inline int RGBToU(uint8 r, uint8 g, uint8 b) { + return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; +} +static inline int RGBToV(uint8 r, uint8 g, uint8 b) { + return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; +} + +void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { + for (int x = 0; x < width; ++x) { + dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]); + src_argb0 += 4; + dst_y += 1; + } +} + +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* src_argb1 = src_argb0 + src_stride_argb; + for (int x = 0; x < width - 1; x += 2) { + uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2; + uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2; + uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb0 += 8; + src_argb1 += 8; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1; + uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1; + uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + __declspec(naked) void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, @@ -200,4 +370,6 @@ void FastConvertYToRGB32Row(const uint8* y_buf, } } +#endif + } // extern "C"