diff --git a/README.chromium b/README.chromium index b44f26f62..017031390 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1903 +Version: 1904 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 84f35c4d0..e26e427d0 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1903 +#define LIBYUV_VERSION 1904 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_common.cc b/source/row_common.cc index 5182a1d8d..5e1551b99 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -662,15 +662,13 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { } #endif -// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. +// ARM uses uint16 #if !defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { - return STATIC_CAST( - uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8); +static __inline int RGBxToU(uint16_t r, uint16_t g, uint16_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); } -static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { - return STATIC_CAST( - uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8); +static __inline int RGBxToV(uint16_t r, uint16_t g, uint16_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); } #endif @@ -713,7 +711,7 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { } \ } #else -// ARM version does sum / 2 then multiply by 2x smaller coefficients +// ARM version does average of 4 pixels with rounding #define MAKEROWY(NAME, R, G, B, BPP) \ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ int x; \ @@ -729,27 +727,27 @@ static 
__inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { int x; \ for (x = 0; x < width - 1; x += 2) { \ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ + src_rgb1[B + BPP] + 2) >> \ + 2; \ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ + src_rgb1[G + BPP] + 2) >> \ + 2; \ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb1[R + BPP] + 2) >> \ + 2; \ + dst_u[0] = RGBxToU(ar, ag, ab); \ + dst_v[0] = RGBxToV(ar, ag, ab); \ src_rgb += BPP * 2; \ src_rgb1 += BPP * 2; \ dst_u += 1; \ dst_v += 1; \ } \ if (width & 1) { \ - uint16_t ab = src_rgb[B] + src_rgb1[B]; \ - uint16_t ag = src_rgb[G] + src_rgb1[G]; \ - uint16_t ar = src_rgb[R] + src_rgb1[R]; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ + uint16_t ab = (src_rgb[B] + src_rgb1[B] + 1) >> 1; \ + uint16_t ag = (src_rgb[G] + src_rgb1[G] + 1) >> 1; \ + uint16_t ar = (src_rgb[R] + src_rgb1[R] + 1) >> 1; \ + dst_u[0] = RGBxToU(ar, ag, ab); \ + dst_v[0] = RGBxToV(ar, ag, ab); \ } \ } #endif @@ -806,11 +804,11 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #if !defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; +static __inline uint8_t RGBxToUJ(uint16_t r, uint16_t g, uint16_t b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } -static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; +static __inline uint8_t RGBxToVJ(uint16_t r, uint16_t g, uint16_t b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #endif @@ -853,7 +851,7 @@ static __inline uint8_t 
RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { } \ } #else -// ARM version does sum / 2 then multiply by 2x smaller coefficients +// ARM version does average of 4 pixels with rounding #define MAKEROWYJ(NAME, R, G, B, BPP) \ void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ int x; \ @@ -869,27 +867,27 @@ static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { int x; \ for (x = 0; x < width - 1; x += 2) { \ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ + src_rgb1[B + BPP] + 2) >> \ + 2; \ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ + src_rgb1[G + BPP] + 2) >> \ + 2; \ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + src_rgb1[R + BPP] + 2) >> \ + 2; \ + dst_u[0] = RGBxToUJ(ar, ag, ab); \ + dst_v[0] = RGBxToVJ(ar, ag, ab); \ src_rgb += BPP * 2; \ src_rgb1 += BPP * 2; \ dst_u += 1; \ dst_v += 1; \ } \ if (width & 1) { \ - uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ - uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ - uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + uint16_t ab = (src_rgb[B] + src_rgb1[B] + 1) >> 1; \ + uint16_t ag = (src_rgb[G] + src_rgb1[G] + 1) >> 1; \ + uint16_t ar = (src_rgb[R] + src_rgb1[R] + 1) >> 1; \ + dst_u[0] = RGBxToUJ(ar, ag, ab); \ + dst_v[0] = RGBxToVJ(ar, ag, ab); \ } \ } @@ -994,11 +992,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); + uint16_t b = (b0 + b1 + b2 + b3 + 2) >> 2; + uint16_t g = (g0 + g1 + g2 + g3 + 2) >> 2; + 
uint16_t r = (r0 + r1 + r2 + r3 + 2) >> 2; + dst_u[0] = RGBxToU(r, g, b); + dst_v[0] = RGBxToV(r, g, b); #endif src_rgb565 += 4; @@ -1029,11 +1027,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); + uint16_t b = (b0 + b2 + 1) >> 1; + uint16_t g = (g0 + g2 + 1) >> 1; + uint16_t r = (r0 + r2 + 1) >> 1; + dst_u[0] = RGBxToU(r, g, b); + dst_v[0] = RGBxToV(r, g, b); #endif } } @@ -1083,11 +1081,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); + uint16_t b = (b0 + b1 + b2 + b3 + 2) >> 2; + uint16_t g = (g0 + g1 + g2 + g3 + 2) >> 2; + uint16_t r = (r0 + r1 + r2 + r3 + 2) >> 2; + dst_u[0] = RGBxToU(r, g, b); + dst_v[0] = RGBxToV(r, g, b); #endif src_argb1555 += 4; @@ -1119,11 +1117,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); + uint16_t b = (b0 + b2 + 1) >> 1; + uint16_t g = (g0 + g2 + 1) >> 1; + uint16_t r = (r0 + r2 + 1) >> 1; + dst_u[0] = RGBxToU(r, g, b); + dst_v[0] = RGBxToV(r, g, b); #endif } } @@ -1169,11 +1167,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); + uint16_t b = (b0 + b1 + b2 + b3 + 2) >> 2; + 
uint16_t g = (g0 + g1 + g2 + g3 + 2) >> 2; + uint16_t r = (r0 + r1 + r2 + r3 + 2) >> 2; + dst_u[0] = RGBxToU(r, g, b); + dst_v[0] = RGBxToV(r, g, b); #endif src_argb4444 += 4; @@ -1203,11 +1201,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); + uint16_t b = (b0 + b2 + 1) >> 1; + uint16_t g = (g0 + g2 + 1) >> 1; + uint16_t r = (r0 + r2 + 1) >> 1; + dst_u[0] = RGBxToU(r, g, b); + dst_v[0] = RGBxToV(r, g, b); #endif } } diff --git a/source/row_neon.cc b/source/row_neon.cc index 49d7584dc..16ad3a936 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1933,11 +1933,11 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient + "vmov.s16 q11, #74 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -1952,9 +1952,9 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q0, q0, #2 \n" // average of 4 + "vrshr.u16 q1, q1, #2 \n" + "vrshr.u16 q2, q2, #2 \n" RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
@@ -1971,7 +1971,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } -// TODO(fbarchard): Subsample match Intel code. void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1979,11 +1978,11 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.s16 q10, #127 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -1998,9 +1997,9 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q0, q0, #2 \n" // average of 4 + "vrshr.u16 q1, q1, #2 \n" + "vrshr.u16 q2, q2, #2 \n" RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
@@ -2024,11 +2023,11 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.s16 q10, #127 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. @@ -2043,9 +2042,9 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q0, q0, #2 \n" // average of 4 + "vrshr.u16 q1, q1, #2 \n" + "vrshr.u16 q2, q2, #2 \n" RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. @@ -2070,11 +2069,11 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.s16 q10, #127 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! 
\n" // load 8 RGB24 pixels. @@ -2089,9 +2088,9 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q0, q0, #2 \n" // average of 4 + "vrshr.u16 q1, q1, #2 \n" + "vrshr.u16 q2, q2, #2 \n" RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. @@ -2116,11 +2115,11 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.s16 q10, #127 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. @@ -2135,9 +2134,9 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q0, q0, #2 \n" // average of 4 + "vrshr.u16 q1, q1, #2 \n" + "vrshr.u16 q2, q2, #2 \n" RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
@@ -2161,11 +2160,11 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_bgra - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient + "vmov.s16 q11, #74 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. @@ -2180,9 +2179,9 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q1, q1, #1 \n" // 2x average - "vrshr.u16 q2, q2, #1 \n" - "vrshr.u16 q3, q3, #1 \n" + "vrshr.u16 q1, q1, #2 \n" // average of 4 + "vrshr.u16 q2, q2, #2 \n" + "vrshr.u16 q3, q3, #2 \n" RGBTOUV(q3, q2, q1) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
@@ -2190,7 +2189,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(src_stride_bgra), // %1 - "+r"(dst_u), // %2 + "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : @@ -2206,11 +2205,11 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_abgr - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient + "vmov.s16 q11, #74 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. @@ -2225,9 +2224,9 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q0, q0, #2 \n" // average of 4 + "vrshr.u16 q1, q1, #2 \n" + "vrshr.u16 q2, q2, #2 \n" RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
@@ -2251,11 +2250,11 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgba - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient + "vmov.s16 q11, #74 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. @@ -2270,9 +2269,9 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q0, q0, #2 \n" // average of 4 + "vrshr.u16 q1, q1, #2 \n" + "vrshr.u16 q2, q2, #2 \n" RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. @@ -2296,11 +2295,11 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient + "vmov.s16 q11, #74 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. 
@@ -2315,9 +2314,9 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q0, q0, #2 \n" // average of 4 + "vrshr.u16 q1, q1, #2 \n" + "vrshr.u16 q2, q2, #2 \n" RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. @@ -2341,11 +2340,11 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient + "vmov.s16 q11, #74 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. @@ -2360,9 +2359,9 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q0, q0, #2 \n" // average of 4 + "vrshr.u16 q1, q1, #2 \n" + "vrshr.u16 q2, q2, #2 \n" RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
@@ -2387,12 +2386,11 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int width) { asm volatile( "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient + "vmov.s16 q11, #74 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. @@ -2418,9 +2416,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "vrshr.u16 q4, q4, #2 \n" // average of 4 + "vrshr.u16 q5, q5, #2 \n" + "vrshr.u16 q6, q6, #2 \n" "vmul.s16 q8, q4, q10 \n" // B "vmls.s16 q8, q5, q11 \n" // G @@ -2453,12 +2451,11 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, int width) { asm volatile( "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient + "vmov.s16 q11, #74 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 
@@ -2484,9 +2481,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "vrshr.u16 q4, q4, #2 \n" // average of 4 + "vrshr.u16 q5, q5, #2 \n" + "vrshr.u16 q6, q6, #2 \n" "vmul.s16 q8, q4, q10 \n" // B "vmls.s16 q8, q5, q11 \n" // G @@ -2519,12 +2516,11 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient + "vmov.s16 q11, #74 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -2550,9 +2546,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q0, q4, #1 \n" // 2x average - "vrshr.u16 q1, q5, #1 \n" - "vrshr.u16 q2, q6, #1 \n" + "vrshr.u16 q0, q4, #2 \n" // average of 4 + "vrshr.u16 q1, q5, #2 \n" + "vrshr.u16 q2, q6, #2 \n" RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
diff --git a/source/row_neon64.cc b/source/row_neon64.cc index c30ef680c..71e132876 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2853,13 +2853,13 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb, &kRgb24JPEGUVConstants); } -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #112 \n" /* UB/VR coefficient (0.875) */ \ + "movi v21.8h, #74 \n" /* UG coefficient (-0.5781) */ \ + "movi v22.8h, #38 \n" /* UR coefficient (-0.2969) */ \ + "movi v23.8h, #18 \n" /* VB coefficient (-0.1406) */ \ + "movi v24.8h, #94 \n" /* VG coefficient (-0.7344) */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. // clang-format off @@ -2899,9 +2899,9 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" + "urshr v0.8h, v0.8h, #2 \n" // average of 4 + "urshr v1.8h, v1.8h, #2 \n" + "urshr v2.8h, v2.8h, #2 \n" RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -2918,7 +2918,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } -// TODO(fbarchard): Subsample match Intel code. 
void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2926,11 +2925,11 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, int width) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v20.8h, #127 \n" // UB/VR coeff (0.500) + "movi v21.8h, #84 \n" // UG coeff (-0.33126) + "movi v22.8h, #43 \n" // UR coeff (-0.16874) + "movi v23.8h, #20 \n" // VB coeff (-0.08131) + "movi v24.8h, #107 \n" // VG coeff (-0.41869) "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. @@ -2945,9 +2944,9 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" + "urshr v0.8h, v0.8h, #2 \n" // average of 4 + "urshr v1.8h, v1.8h, #2 \n" + "urshr v2.8h, v2.8h, #2 \n" RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 
@@ -2971,11 +2970,11 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, int width) { const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v20.8h, #127 \n" // UB/VR coeff (0.500) + "movi v21.8h, #84 \n" // UG coeff (-0.33126) + "movi v22.8h, #43 \n" // UR coeff (-0.16874) + "movi v23.8h, #20 \n" // VB coeff (-0.08131) + "movi v24.8h, #107 \n" // VG coeff (-0.41869) "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. @@ -2990,9 +2989,9 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" + "urshr v0.8h, v0.8h, #2 \n" // average of 4 + "urshr v1.8h, v1.8h, #2 \n" + "urshr v2.8h, v2.8h, #2 \n" RGBTOUV(v2.8h, v1.8h, v0.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 
@@ -3016,11 +3015,11 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int width) { const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v20.8h, #127 \n" // UB/VR coeff (0.500) + "movi v21.8h, #84 \n" // UG coeff (-0.33126) + "movi v22.8h, #43 \n" // UR coeff (-0.16874) + "movi v23.8h, #20 \n" // VB coeff (-0.08131) + "movi v24.8h, #107 \n" // VG coeff (-0.41869) "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. @@ -3035,9 +3034,9 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" + "urshr v0.8h, v0.8h, #2 \n" // average of 4 + "urshr v1.8h, v1.8h, #2 \n" + "urshr v2.8h, v2.8h, #2 \n" RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 
@@ -3061,11 +3060,11 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, int width) { const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v20.8h, #127 \n" // UB/VR coeff (0.500) + "movi v21.8h, #84 \n" // UG coeff (-0.33126) + "movi v22.8h, #43 \n" // UR coeff (-0.16874) + "movi v23.8h, #20 \n" // VB coeff (-0.08131) + "movi v24.8h, #107 \n" // VG coeff (-0.41869) "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. @@ -3080,9 +3079,9 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" + "urshr v0.8h, v0.8h, #2 \n" // average of 4 + "urshr v1.8h, v1.8h, #2 \n" + "urshr v2.8h, v2.8h, #2 \n" RGBTOUV(v2.8h, v1.8h, v0.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -3120,9 +3119,9 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v3.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" + "urshr v0.8h, v0.8h, #2 \n" // average of 4 + "urshr v1.8h, v3.8h, #2 \n" + "urshr v2.8h, v2.8h, #2 \n" RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -3160,9 +3159,9 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. 
- "urshr v0.8h, v3.8h, #1 \n" // 2x average - "urshr v2.8h, v2.8h, #1 \n" - "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v3.8h, #2 \n" // average of 4 + "urshr v2.8h, v2.8h, #2 \n" + "urshr v1.8h, v1.8h, #2 \n" RGBTOUV(v0.8h, v2.8h, v1.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -3200,9 +3199,9 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" + "urshr v0.8h, v0.8h, #2 \n" // average of 4 + "urshr v1.8h, v1.8h, #2 \n" + "urshr v2.8h, v2.8h, #2 \n" RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -3240,9 +3239,9 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" + "urshr v0.8h, v0.8h, #2 \n" // average of 4 + "urshr v1.8h, v1.8h, #2 \n" + "urshr v2.8h, v2.8h, #2 \n" RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -3280,9 +3279,9 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - "urshr v2.8h, v2.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v0.8h, v0.8h, #1 \n" + "urshr v2.8h, v2.8h, #2 \n" // average of 4 + "urshr v1.8h, v1.8h, #2 \n" + "urshr v0.8h, v0.8h, #2 \n" RGBTOUV(v2.8h, v1.8h, v0.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -3324,9 +3323,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 
- "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #2 \n" // average of 4 + "urshr v1.8h, v17.8h, #2 \n" + "urshr v2.8h, v18.8h, #2 \n" RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -3368,9 +3367,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #2 \n" // average of 4 + "urshr v1.8h, v17.8h, #2 \n" + "urshr v2.8h, v18.8h, #2 \n" RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -3412,9 +3411,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #2 \n" // average of 4 + "urshr v1.8h, v17.8h, #2 \n" + "urshr v2.8h, v18.8h, #2 \n" RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 
diff --git a/source/row_sve.cc b/source/row_sve.cc index 0bab8e16f..27bf87a6c 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -205,32 +205,32 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y, static const int16_t kARGBToUVCoefficients[] = { // UB, -UR, -UG, 0, -VB, VR, -VG, 0 - 56, -19, -37, 0, -9, 56, -47, 0, + 112, -38, -74, 0, -18, 112, -94, 0, }; static const int16_t kRGBAToUVCoefficients[] = { // 0, -UG, UB, -UR, 0, -VG, -VB, VR - 0, -37, 56, -19, 0, -47, -9, 56, + 0, -74, 112, -38, 0, -94, -18, 112, }; static const int16_t kBGRAToUVCoefficients[] = { // 0, -UG, -UR, UB, 0, -VG, VR, -VB - 0, -37, -19, 56, 0, -47, 56, -9, + 0, -74, -38, 112, 0, -94, 112, -18, }; static const int16_t kABGRToUVCoefficients[] = { // -UR, UB, -UG, 0, VR, -VB, -VG, 0 - -19, 56, -37, 0, 56, -9, -47, 0, + -38, 112, -74, 0, 112, -18, -94, 0, }; static const int16_t kARGBToUVJCoefficients[] = { // UB, -UR, -UG, 0, -VB, VR, -VG, 0 - 63, -21, -42, 0, -10, 63, -53, 0, + 127, -43, -84, 0, -20, 127, -107, 0, }; static const int16_t kABGRToUVJCoefficients[] = { // -UR, UB, -UG, 0, VR, -VB, -VG, 0 - -21, 63, -42, 0, 63, -10, -53, 0, + -43, 127, -84, 0, 127, -20, -107, 0, }; static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, @@ -285,10 +285,15 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, "subs %w[width], %w[width], %w[vl] \n" // 4*VL per loop - "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga - "urhadd z2.h, p0/m, z2.h, z3.h \n" // brgabrga - "urhadd z4.h, p0/m, z4.h, z5.h \n" // brgabrga - "urhadd z6.h, p0/m, z6.h, z7.h \n" // brgabrga + "add z0.h, p0/m, z0.h, z1.h \n" // brgabrga + "add z2.h, p0/m, z2.h, z3.h \n" // brgabrga + "add z4.h, p0/m, z4.h, z5.h \n" // brgabrga + "add z6.h, p0/m, z6.h, z7.h \n" // brgabrga + + "urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga + "urshr z2.h, p0/m, z2.h, #2 \n" // brgabrga + "urshr z4.h, p0/m, z4.h, #2 \n" // brgabrga + "urshr z6.h, p0/m, z6.h, #2 \n" // brgabrga "movi v16.8h, #0 \n" "movi v17.8h, #0 \n" @@ -350,7 +355,9 @@ 
static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, "trn1 z0.s, z16.s, z17.s \n" // brgabgra "trn2 z1.s, z16.s, z17.s \n" // brgabgra - "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga + "add z0.h, p0/m, z0.h, z1.h \n" // brgabrga + + "urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga "subs %w[width], %w[width], %w[vl] \n" // VL per loop diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index be36343b0..eb0d4bbd9 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -2076,7 +2076,7 @@ TEST_F(LibYUVConvertTest, TestRGB24ToJ420) { } uint32_t checksum = HashDjb2(dest_j420, kSize * 3 / 2 * 2, 5381); - EXPECT_EQ(2755440272u, checksum); + EXPECT_EQ(4157186353u, checksum); free_aligned_buffer_page_end(orig_rgb24); free_aligned_buffer_page_end(dest_j420);