/*
 * Review summary of this patch. This text sits ahead of the first
 * "diff --git" header and changes none of the hunks below.
 *
 * Purpose: add the Neon row functions I210ToAR30Row_NEON and
 * I410ToAR30Row_NEON (source/row_neon64.cc) and wire them into the AR30
 * converters.
 *
 * include/libyuv/row.h
 *   - Defines HAS_I210TOAR30ROW_NEON and HAS_I410TOAR30ROW_NEON.
 *   - Declares the two _NEON functions and their _Any_NEON counterparts.
 *     All four take three const uint16_t* planes, a uint8_t* destination,
 *     a const struct YuvConstants* and an int width.
 *
 * source/convert_argb.cc
 *   - I010ToAR30Matrix and I210ToAR30Matrix: when HAS_I210TOAR30ROW_NEON is
 *     defined and TestCpuFlag(kCpuHasNEON) is true, I210ToAR30Row is set to
 *     I210ToAR30Row_Any_NEON, then to I210ToAR30Row_NEON if
 *     IS_ALIGNED(width, 8).
 *   - I410ToAR30Matrix, I010ToAR30MatrixBilinear and I210ToAR30MatrixLinear:
 *     the same selection for I410ToAR30Row under HAS_I410TOAR30ROW_NEON.
 *   - Each new section is inserted directly ahead of the existing SSSE3
 *     section of the same function.
 *
 * source/row_any.cc
 *   - Instantiates both _Any_NEON wrappers through ANY31CT. The arguments
 *     are (1, 0, uint16_t, 2, 4, 7) for I210 and (0, 0, uint16_t, 2, 4, 7)
 *     for I410, the same tuples as the neighbouring I210ToARGBRow and
 *     I410ToARGBRow instantiations. ANY31CT itself is defined outside these
 *     hunks, so the meaning of each argument is not restated here.
 *
 * source/row_neon64.cc
 *   - STOREAR30: per its own bit-layout comments, takes B/G/R in
 *     v16/v17/v18, the umin limit in v22 and the alpha bits in v23, builds
 *     v4 = ggggggbbbbbbbbbb and v5 = 11rrrrrrrrrrgggg, and writes both with
 *     one st2 that post-increments dst_ar30 by 32.
 *   - I210ToAR30Row_NEON / I410ToAR30Row_NEON: broadcast limit 0x3ff0 into
 *     v22 and alpha 0xc000 into v23, then loop over READYUV210 (or
 *     READYUV410), NVTORGB and STOREAR30, subtracting 8 from width on each
 *     pass and repeating while the result is greater than zero (b.gt).
 *     v22 and v23 are listed as clobbers next to YUVTORGB_REGS.
 *   - YUVTORGB_SETUP, READYUV210, READYUV410 and NVTORGB are defined outside
 *     these hunks; their register usage should be checked against the
 *     clobber list when reviewing.
 *
 * NOTE(review): in this copy the patch text is folded onto four physical
 * lines. The original newlines need to be restored before the patch is
 * handed to a patch tool.
 */
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index e164a502d..f3be39db5 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -559,6 +559,8 @@ extern "C" { #define HAS_I410ALPHATOARGBROW_NEON #define HAS_I210TOARGBROW_NEON #define HAS_I410TOARGBROW_NEON +#define HAS_I210TOAR30ROW_NEON +#define HAS_I410TOAR30ROW_NEON #define HAS_ABGRTOYJROW_NEON_DOTPROD #define HAS_ABGRTOYROW_NEON_DOTPROD @@ -1090,6 +1092,18 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void I210ToAR30Row_NEON(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_NEON(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -5104,6 +5118,18 @@ void I410ToARGBRow_Any_NEON(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I210ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 291bab63c..5c710343d 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -954,6 +954,14 @@ int I010ToAR30Matrix(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } +#if defined(HAS_I210TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I210ToAR30Row = 
I210ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_NEON; + } + } +#endif #if defined(HAS_I210TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I210ToAR30Row = I210ToAR30Row_Any_SSSE3; @@ -1172,6 +1180,14 @@ int I210ToAR30Matrix(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } +#if defined(HAS_I210TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I210ToAR30Row = I210ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_NEON; + } + } +#endif #if defined(HAS_I210TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I210ToAR30Row = I210ToAR30Row_Any_SSSE3; @@ -1327,6 +1343,14 @@ int I410ToAR30Matrix(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } +#if defined(HAS_I410TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I410ToAR30Row = I410ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_NEON; + } + } +#endif #if defined(HAS_I410TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I410ToAR30Row = I410ToAR30Row_Any_SSSE3; @@ -6699,6 +6723,14 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } +#if defined(HAS_I410TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I410ToAR30Row = I410ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_NEON; + } + } +#endif #if defined(HAS_I410TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I410ToAR30Row = I410ToAR30Row_Any_SSSE3; @@ -6805,6 +6837,14 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } +#if defined(HAS_I410TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I410ToAR30Row = I410ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_NEON; + } + } 
+#endif #if defined(HAS_I410TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I410ToAR30Row = I410ToAR30Row_Any_SSSE3; diff --git a/source/row_any.cc b/source/row_any.cc index 57414f3f7..351aa4a2c 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -522,6 +522,12 @@ ANY31CT(I210ToARGBRow_Any_NEON, I210ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7) #ifdef HAS_I410TOARGBROW_NEON ANY31CT(I410ToARGBRow_Any_NEON, I410ToARGBRow_NEON, 0, 0, uint16_t, 2, 4, 7) #endif +#ifdef HAS_I210TOAR30ROW_NEON +ANY31CT(I210ToAR30Row_Any_NEON, I210ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOAR30ROW_NEON +ANY31CT(I410ToAR30Row_Any_NEON, I410ToAR30Row_NEON, 0, 0, uint16_t, 2, 4, 7) +#endif #undef ANY31CT // Any 3 planes to 1 plane with parameter diff --git a/source/row_neon64.cc b/source/row_neon64.cc index bc5aeb07c..b2c8b61ce 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -182,6 +182,24 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13, "uqshl v16.8h, v16.8h, #2 \n" \ "uqshl v18.8h, v18.8h, #2 \n" +// Store 2.14 fixed point RGB as AR30 elements +#define STOREAR30 \ + /* Inputs: \ + * v16.8h: xxbbbbbbbbbbxxxx \ + * v17.8h: xxggggggggggxxxx \ + * v18.8h: xxrrrrrrrrrrxxxx \ + * v22.8h: 0011111111110000 (umin limit) \ + * v23.8h: 1100000000000000 (alpha) \ + */ \ + "uqshl v0.8h, v16.8h, #2 \n" /* bbbbbbbbbbxxxxxx */ \ + "uqshl v1.8h, v17.8h, #2 \n" /* ggggggggggxxxxxx */ \ + "umin v2.8h, v18.8h, v22.8h \n" /* 00rrrrrrrrrrxxxx */ \ + "shl v4.8h, v1.8h, #4 \n" /* ggggggxxxxxx0000 */ \ + "orr v5.16b, v2.16b, v23.16b \n" /* 11rrrrrrrrrrxxxx */ \ + "sri v4.8h, v0.8h, #6 \n" /* ggggggbbbbbbbbbb */ \ + "sri v5.8h, v1.8h, #12 \n" /* 11rrrrrrrrrrgggg */ \ + "st2 {v4.8h, v5.8h}, [%[dst_ar30]], #32 \n" + #define YUVTORGB_REGS \ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \ "v25", "v26", "v27", "v28", "v29", "v30", "v31" @@ -233,6 +251,62 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, : "cc", "memory", 
YUVTORGB_REGS); } +void I210ToAR30Row_NEON(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + const uvec8* uv_coeff = &yuvconstants->kUVCoeff; + const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; + uint16_t limit = 0x3ff0; + uint16_t alpha = 0xc000; + asm(YUVTORGB_SETUP + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" READYUV210 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [alpha] "r"(alpha) // %[alpha] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); +} + +void I410ToAR30Row_NEON(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + const uvec8* uv_coeff = &yuvconstants->kUVCoeff; + const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; + uint16_t limit = 0x3ff0; + uint16_t alpha = 0xc000; + asm(YUVTORGB_SETUP + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" READYUV410 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [alpha] "r"(alpha) // %[alpha] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); +} + void I210ToARGBRow_NEON(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v,