diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 815ac6a5a..7fce069ff 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -405,6 +405,7 @@ extern "C" { #define HAS_BGRATOYROW_NEON #define HAS_BYTETOFLOATROW_NEON #define HAS_CONVERT16TO8ROW_NEON +#define HAS_CONVERT8TO8ROW_NEON #define HAS_COPYROW_NEON #define HAS_DETILEROW_16_NEON #define HAS_DETILEROW_NEON @@ -507,7 +508,6 @@ extern "C" { // The following are available on AArch64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -#define HAS_CONVERT8TO8ROW_NEON #define HAS_ARGBTOAR30ROW_NEON #define HAS_ABGRTOAR30ROW_NEON #define HAS_I210ALPHATOARGBROW_NEON diff --git a/source/row_neon.cc b/source/row_neon.cc index 8c51b6bb3..9b2b80d7b 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -143,9 +143,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" + "1: \n" READYUV444 + "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -166,9 +165,8 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" + "1: \n" READYUV444 + "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -190,9 +188,8 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" + "1: \n" READYUV422 + "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -214,10 +211,9 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 + "1: \n" READYUV444 + "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vld1.8 {d6}, [%[src_a]]! \n" - "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -240,10 +236,9 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 + "1: \n" READYUV422 + "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vld1.8 {d6}, [%[src_a]]! \n" - "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -266,9 +261,9 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA - "bgt 1b \n" + "1: \n" READYUV422 + "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 + STORERGBA "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -288,9 +283,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" + "1: \n" READYUV422 + "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -319,8 +313,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 + "1: \n" READYUV422 + "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 + ARGBTORGB565 "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -350,9 +345,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" + "1: \n" READYUV422 + "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. "bgt 1b \n" @@ -425,9 +419,9 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { "vmov.u8 d23, #255 \n" "1: \n" "vld1.8 {d20}, [%0]! \n" + "subs %2, %2, #8 \n" "vmov d21, d20 \n" "vmov d22, d20 \n" - "subs %2, %2, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -731,6 +725,7 @@ void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { "vld1.8 {q11}, [%0]! \n" "vld1.8 {q13}, [%0]! \n" "vld1.8 {q15}, [%0]! \n" + "subs %2, %2, #80 \n" "vshl.u8 q8, q14, #6 \n" // Shift lower bit data // appropriately. "vshl.u8 q10, q14, #4 \n" @@ -753,7 +748,6 @@ void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { "vsri.u16 q15, q15, #10 \n" "vstmia %1!, {q8-q15} \n" // Store pixel block (64 // pixels). - "subs %2, %2, #80 \n" "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -954,6 +948,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, "vld1.16 {d4}, [%2]! \n" // B "vld1.16 {d2}, [%1]! \n" // G "vld1.16 {d0}, [%0]! \n" // R + "subs %4, %4, #4 \n" "vmovl.u16 q2, d4 \n" // B "vmovl.u16 q1, d2 \n" // G "vmovl.u16 q0, d0 \n" // R @@ -966,7 +961,6 @@ void MergeXR30Row_NEON(const uint16_t* src_r, "vsli.u32 q2, q1, #10 \n" // 00GB "vsli.u32 q2, q0, #20 \n" // 0RGB "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) - "subs %4, %4, #4 \n" "vst1.8 {q2}, [%3]! \n" "bgt 1b \n" : "+r"(src_r), // %0 @@ -990,6 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, "vld1.16 {d4}, [%2]! \n" // B "vld1.16 {d2}, [%1]! \n" // G "vld1.16 {d0}, [%0]! \n" // R + "subs %4, %4, #4 \n" "vmovl.u16 q2, d4 \n" // 000B "vmovl.u16 q1, d2 \n" // G "vmovl.u16 q0, d0 \n" // R @@ -999,7 +994,6 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, "vsli.u32 q2, q1, #10 \n" // 00GB "vsli.u32 q2, q0, #20 \n" // 0RGB "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) - "subs %4, %4, #4 \n" "vst1.8 {q2}, [%3]! \n" "bgt 1b \n" "3: \n" @@ -1030,6 +1024,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B "vld1.16 {q3}, [%3]! \n" // A + "subs %5, %5, #8 \n" "vmin.u16 q2, q2, q14 \n" "vmin.u16 q1, q1, q14 \n" "vmin.u16 q0, q0, q14 \n" @@ -1038,7 +1033,6 @@ void MergeAR64Row_NEON(const uint16_t* src_r, "vshl.u16 q1, q1, q15 \n" "vshl.u16 q0, q0, q15 \n" "vshl.u16 q3, q3, q15 \n" - "subs %5, %5, #8 \n" "vst4.16 {d0, d2, d4, d6}, [%4]! \n" "vst4.16 {d1, d3, d5, d7}, [%4]! \n" "bgt 1b \n" @@ -1070,13 +1064,13 @@ void MergeXR64Row_NEON(const uint16_t* src_r, "vld1.16 {q2}, [%0]! \n" // R "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B + "subs %4, %4, #8 \n" "vmin.u16 q2, q2, q14 \n" "vmin.u16 q1, q1, q14 \n" "vmin.u16 q0, q0, q14 \n" "vshl.u16 q2, q2, q15 \n" "vshl.u16 q1, q1, q15 \n" "vshl.u16 q0, q0, q15 \n" - "subs %4, %4, #8 \n" "vst4.16 {d0, d2, d4, d6}, [%3]! \n" "vst4.16 {d1, d3, d5, d7}, [%3]! \n" "bgt 1b \n" @@ -1106,6 +1100,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B "vld1.16 {q3}, [%3]! \n" // A + "subs %5, %5, #8 \n" "vshl.u16 q2, q2, q15 \n" "vshl.u16 q1, q1, q15 \n" "vshl.u16 q0, q0, q15 \n" @@ -1114,7 +1109,6 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, "vqmovn.u16 d1, q1 \n" "vqmovn.u16 d2, q2 \n" "vqmovn.u16 d3, q3 \n" - "subs %5, %5, #8 \n" "vst4.8 {d0, d1, d2, d3}, [%4]! \n" "bgt 1b \n" : "+r"(src_r), // %0 @@ -1142,13 +1136,13 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, "vld1.16 {q2}, [%0]! \n" // R "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B + "subs %4, %4, #8 \n" "vshl.u16 q2, q2, q15 \n" "vshl.u16 q1, q1, q15 \n" "vshl.u16 q0, q0, q15 \n" "vqmovn.u16 d5, q2 \n" "vqmovn.u16 d4, q1 \n" "vqmovn.u16 d3, q0 \n" - "subs %4, %4, #8 \n" "vst4.u8 {d3, d4, d5, d6}, [%3]! \n" "bgt 1b \n" : "+r"(src_r), // %0 @@ -1600,8 +1594,8 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "vrhadd.u8 d1, d1, d5 \n" // average rows of U "vrhadd.u8 d3, d3, d7 \n" // average rows of V "vst1.8 {d1}, [%2]! \n" // store 8 U. @@ -1627,8 +1621,8 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, "add %1, %0, %1 \n" // stride + src_uyvy "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "vrhadd.u8 d0, d0, d4 \n" // average rows of U "vrhadd.u8 d2, d2, d6 \n" // average rows of V "vst1.8 {d0}, [%2]! \n" // store 8 U. @@ -1924,6 +1918,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. @@ -1937,7 +1932,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -1970,6 +1964,7 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. @@ -1983,7 +1978,6 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2015,6 +2009,7 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. @@ -2028,7 +2023,6 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2061,6 +2055,7 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. @@ -2074,7 +2069,6 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2107,6 +2101,7 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. @@ -2120,7 +2115,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2152,6 +2146,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. @@ -2165,7 +2160,6 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, "vrshr.u16 q2, q2, #1 \n" "vrshr.u16 q3, q3, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q3, q2, q1) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2197,6 +2191,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. @@ -2210,7 +2205,6 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2242,6 +2236,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. @@ -2255,7 +2250,6 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2287,6 +2281,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. @@ -2300,7 +2295,6 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2332,6 +2326,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. @@ -2345,7 +2340,6 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2378,6 +2372,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. RGB565TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. @@ -2403,7 +2398,6 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "vrshr.u16 q5, q5, #1 \n" "vrshr.u16 q6, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. "vmul.s16 q8, q4, q10 \n" // B "vmls.s16 q8, q5, q11 \n" // G "vmls.s16 q8, q6, q12 \n" // R @@ -2444,6 +2438,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. RGB555TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. @@ -2469,7 +2464,6 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "vrshr.u16 q5, q5, #1 \n" "vrshr.u16 q6, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. "vmul.s16 q8, q4, q10 \n" // B "vmls.s16 q8, q5, q11 \n" // G "vmls.s16 q8, q6, q12 \n" // R @@ -2510,6 +2504,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %4, %4, #16 \n" // 16 processed per loop. ARGB4444TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. @@ -2535,7 +2530,6 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "vrshr.u16 q1, q5, #1 \n" "vrshr.u16 q2, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. @@ -2633,9 +2627,9 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop. "vmov.u8 q1, q0 \n" "vmov.u8 q3, q2 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels "bgt 1b \n" @@ -2658,13 +2652,13 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop. "vtbl.8 d2, {d0, d1}, d8 \n" "vtbl.8 d3, {d0, d1}, d9 \n" "vtbl.8 d6, {d4, d5}, d8 \n" "vtbl.8 d7, {d4, d5}, d9 \n" "vmov.u8 q0, q1 \n" "vmov.u8 q2, q3 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels "bgt 1b \n" @@ -2684,11 +2678,11 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, "vld1.16 {q1}, [%0]! \n" "vld1.16 {q2}, [%0]! \n" "vld1.16 {q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop. "vshrn.u16 d0, q0, #8 \n" "vshrn.u16 d1, q1, #8 \n" "vshrn.u16 d4, q2, #8 \n" "vshrn.u16 d5, q3, #8 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. "vst1.8 {q0}, [%1]! \n" // store 4 pixels "vst1.8 {q2}, [%1]! \n" // store 4 pixels "bgt 1b \n" @@ -2712,11 +2706,11 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, "vld1.16 {q1}, [%0]! \n" "vld1.16 {q2}, [%0]! \n" "vld1.16 {q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop. "vtbl.8 d0, {d0, d1}, d8 \n" "vtbl.8 d1, {d2, d3}, d8 \n" "vtbl.8 d4, {d4, d5}, d8 \n" "vtbl.8 d5, {d6, d7}, d8 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. "vst1.8 {q0}, [%1]! \n" // store 4 pixels "vst1.8 {q2}, [%1]! \n" // store 4 pixels "bgt 1b \n" @@ -3472,6 +3466,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels "vsubl.u8 q0, d0, d1 \n" "vld1.8 {d2}, [%1],%5 \n" // center * 2 "vld1.8 {d3}, [%1],%6 \n" @@ -3480,7 +3475,6 @@ void SobelXRow_NEON(const uint8_t* src_y0, "vadd.s16 q0, q0, q1 \n" "vld1.8 {d2}, [%2],%5 \n" // bottom "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" @@ -3510,6 +3504,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" + "subs %3, %3, #8 \n" // 8 pixels "vsubl.u8 q0, d0, d1 \n" "vld1.8 {d2}, [%0],%4 \n" // center * 2 "vld1.8 {d3}, [%1],%4 \n" @@ -3518,7 +3513,6 @@ void SobelYRow_NEON(const uint8_t* src_y0, "vadd.s16 q0, q0, q1 \n" "vld1.8 {d2}, [%0],%5 \n" // right "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" @@ -3613,6 +3607,7 @@ void GaussCol_NEON(const uint16_t* src0, "1: \n" "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows "vld1.16 {q2}, [%4]! \n" + "subs %6, %6, #8 \n" // 8 processed per loop "vaddl.u16 q0, d2, d4 \n" // * 1 "vaddl.u16 q1, d3, d5 \n" // * 1 "vld1.16 {q2}, [%1]! \n" @@ -3624,7 +3619,6 @@ void GaussCol_NEON(const uint16_t* src0, "vld1.16 {q2}, [%3]! \n" "vmlal.u16 q0, d4, d6 \n" // * 4 "vmlal.u16 q1, d5, d6 \n" // * 4 - "subs %6, %6, #8 \n" // 8 processed per loop "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples "bgt 1b \n" : "+r"(src0), // %0 @@ -3650,6 +3644,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { "1: \n" "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples "vld1.32 {q2}, [%0] \n" + "subs %5, %5, #8 \n" // 8 processed per loop "vadd.u32 q0, q0, q1 \n" // * 1 "vadd.u32 q1, q1, q2 \n" // * 1 "vld1.32 {q2, q3}, [%2]! \n" @@ -3661,7 +3656,6 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { "vadd.u32 q3, q3, q9 \n" "vmla.u32 q0, q2, q10 \n" // * 4 "vmla.u32 q1, q3, q10 \n" // * 4 - "subs %5, %5, #8 \n" // 8 processed per loop "vqshrn.u32 d0, q0, #8 \n" // round and pack "vqshrn.u32 d1, q1, #8 \n" "vst1.u16 {q0}, [%4]! \n" // store 8 samples @@ -3685,11 +3679,11 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, "1: \n" "vld1.8 {q2}, [%0]! \n" // load 16 Y values "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values + "subs %3, %3, #16 \n" // 16 pixels per loop "vmov d1, d0 \n" "vzip.u8 d0, d1 \n" // VV "vmov d3, d2 \n" "vzip.u8 d2, d3 \n" // UU - "subs %3, %3, #16 \n" // 16 pixels per loop "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels "vst3.8 {d1, d3, d5}, [%2]! \n" "bgt 1b \n" @@ -3711,6 +3705,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. + "subs %3, %3, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV @@ -3721,7 +3716,6 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average "vqrshrun.s16 d0, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. "bgt 1b \n" : "+r"(src_ayuv), // %0 @@ -3742,6 +3736,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. + "subs %3, %3, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV @@ -3752,7 +3747,6 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average "vqrshrun.s16 d1, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. "bgt 1b \n" : "+r"(src_ayuv), // %0 @@ -3786,8 +3780,8 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values "vld2.8 {d1, d3}, [%0]! \n" - "vmov.u8 q2, q0 \n" // move U after V "subs %2, %2, #16 \n" // 16 pixels per loop + "vmov.u8 q2, q0 \n" // move U after V "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels "bgt 1b \n" : "+r"(src_uv), // %0 @@ -3811,13 +3805,13 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, "vld1.8 {q1}, [%2]! \n" // load 16 V values "vld1.8 {q2}, [%1]! \n" "vld1.8 {q3}, [%3]! \n" + "subs %5, %5, #16 \n" // 16 src pixels per loop "vpaddl.u8 q0, q0 \n" // half size "vpaddl.u8 q1, q1 \n" "vpadal.u8 q0, q2 \n" "vpadal.u8 q1, q3 \n" "vqrshrn.u16 d0, q0, #2 \n" "vqrshrn.u16 d1, q1, #2 \n" - "subs %5, %5, #16 \n" // 16 src pixels per loop "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels "bgt 1b \n" : "+r"(src_u), // %0 @@ -3840,9 +3834,9 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, "vdup.16 q2, %4 \n" "1: \n" "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV + "subs %3, %3, #8 \n" // 8 src pixels per loop "vshl.u16 q0, q0, q2 \n" "vshl.u16 q1, q1, q2 \n" - "subs %3, %3, #8 \n" // 8 src pixels per loop "vst1.16 {q0}, [%1]! \n" // store 8 U pixels "vst1.16 {q1}, [%2]! \n" // store 8 V pixels "bgt 1b \n" @@ -3865,9 +3859,9 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U "vld1.16 {q1}, [%1]! \n" // load 8 V + "subs %3, %3, #8 \n" // 8 src pixels per loop "vshl.u16 q0, q0, q2 \n" "vshl.u16 q1, q1, q2 \n" - "subs %3, %3, #8 \n" // 8 src pixels per loop "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels "bgt 1b \n" : "+r"(src_u), // %0 @@ -3887,11 +3881,11 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 src pixels per loop "vmul.u16 q0, q0, q2 \n" "vmul.u16 q1, q1, q2 \n" "vst1.16 {q0}, [%1]! \n" "vst1.16 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" // 16 src pixels per loop "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 @@ -3908,6 +3902,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, "vdup.16 d8, %3 \n" "1: \n" "vld1.16 {q2, q3}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 src pixels per loop "vmull.u16 q0, d4, d8 \n" "vmull.u16 q1, d5, d8 \n" "vmull.u16 q2, d6, d8 \n" @@ -3917,7 +3912,6 @@ void DivideRow_16_NEON(const uint16_t* src_y, "vshrn.u32 d2, q2, #16 \n" "vshrn.u32 d3, q3, #16 \n" "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels - "subs %2, %2, #16 \n" // 16 src pixels per loop "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 @@ -3941,11 +3935,11 @@ void Convert16To8Row_NEON(const uint16_t* src_y, "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 src pixels per loop "vshl.u16 q0, q0, q2 \n" // shr = q2 is negative "vshl.u16 q1, q1, q2 \n" "vqmovn.u16 d0, q0 \n" "vqmovn.u16 d1, q1 \n" - "subs %2, %2, #16 \n" // 16 src pixels per loop "vst1.8 {q0}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -3955,6 +3949,41 @@ void Convert16To8Row_NEON(const uint16_t* src_y, : "cc", "memory", "q0", "q1", "q2"); } +// Use scale to convert J420 to I420 +// scale parameter is 8.8 fixed point but limited to 0 to 255 +// Function is based on DivideRow, but adds a bias +// Does not clamp +void Convert8To8Row_NEON(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width) { + asm volatile( + "vdup.8 d8, %3 \n" + "vdup.8 q5, %4 \n" + "1: \n" + "vld1.8 {q2, q3}, [%0]! \n" + "subs %2, %2, #32 \n" // 32 src pixels per loop + "vmull.u8 q0, d4, d8 \n" + "vmull.u8 q1, d5, d8 \n" + "vmull.u8 q2, d6, d8 \n" + "vmull.u8 q3, d7, d8 \n" + "vshrn.u16 d0, q0, #8 \n" + "vshrn.u16 d1, q1, #8 \n" + "vshrn.u16 d2, q2, #8 \n" + "vshrn.u16 d3, q3, #8 \n" + "vadd.u8 q0, q0, q5 \n" + "vadd.u8 q1, q1, q5 \n" + "vst1.8 {q0, q1}, [%1]! \n" // store 32 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale), // %3 + "r"(bias) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "d8", "q5"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. #ifdef __cplusplus diff --git a/source/row_neon64.cc b/source/row_neon64.cc index dc4ca2417..97afa6828 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -242,9 +242,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV444 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUV444 + "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -265,9 +264,8 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV444 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUV444 + "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -293,8 +291,8 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV210 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 + "1: \n" READYUV210 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] @@ -321,8 +319,8 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV410 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 + "1: \n" READYUV410 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] @@ -348,8 +346,8 @@ void I212ToAR30Row_NEON(const uint16_t* src_y, asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV212 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 + "1: \n" READYUV212 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] @@ -371,9 +369,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV210 NVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUV210 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -395,9 +392,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV410 NVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUV410 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -421,9 +417,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV212 NVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUV212 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -445,9 +440,8 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUV422 + "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -475,8 +469,8 @@ void P210ToARGBRow_NEON(const uint16_t* src_y, "movi v19.8b, #255 \n" "ldr q2, [%[kIndices]] \n" "1: \n" // - READYUVP210 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + READYUVP210 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -504,8 +498,8 @@ void P410ToARGBRow_NEON(const uint16_t* src_y, "movi v19.8b, #255 \n" "ldr q2, [%[kIndices]] \n" "1: \n" // - READYUVP410 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + READYUVP410 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -526,23 +520,22 @@ void P210ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; - asm volatile( - YUVTORGB_SETUP + asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP210 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 + "1: \n" READYUVP210 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void P410ToAR30Row_NEON(const uint16_t* src_y, @@ -553,23 +546,22 @@ void P410ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; - asm volatile( - YUVTORGB_SETUP + asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP410 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 + "1: \n" READYUVP410 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void I422ToAR30Row_NEON(const uint8_t* src_y, @@ -585,8 +577,8 @@ void I422ToAR30Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV422 I4XXTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 + "1: \n" READYUV422 + "subs %w[width], %w[width], #8 \n" I4XXTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] @@ -610,8 +602,8 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 - "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8 "subs %w[width], %w[width], #8 \n" + "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -636,8 +628,8 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y, YUVTORGB_SETUP "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 - "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 "subs %w[width], %w[width], #8 \n" + "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -662,8 +654,8 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y, YUVTORGB_SETUP "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210 - "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 "subs %w[width], %w[width], #8 \n" + "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -688,8 +680,8 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 - "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8 "subs %w[width], %w[width], #8 \n" + "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -712,9 +704,8 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v15.8b, #255 \n" /* A */ - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUV422 + "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -735,9 +726,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUV422 + "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -777,9 +767,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8_TOP - "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP + "1: \n" READYUV422 + "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8_TOP + ARGBTORGB565_FROM_TOP "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -818,10 +808,9 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8h, #0x80, lsl #8 \n" - "1: \n" // - READYUV422 I4XXTORGB RGBTORGB8_TOP - "subs %w[width], %w[width], #8 \n" // - ARGBTOARGB1555_FROM_TOP + "1: \n" // + READYUV422 "subs %w[width], %w[width], #8 \n" // + I4XXTORGB RGBTORGB8_TOP ARGBTOARGB1555_FROM_TOP "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels // RGB1555. "b.gt 1b \n" @@ -849,9 +838,8 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 I4XXTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUV422 + "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "movi v19.8b, #255 \n" ARGBTOARGB4444 "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 // pixels @@ -880,8 +868,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, "umull v4.8h, v1.8b, v28.8b \n" /* DB */ "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ "1: \n" READYUV400 I400TORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "subs %w[width], %w[width], #8 \n" RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -898,10 +885,10 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { "movi v23.8b, #255 \n" "1: \n" "ld1 {v20.8b}, [%0], #8 \n" + "subs %w2, %w2, #8 \n" "prfm pldl1keep, [%0, 448] \n" "mov v21.8b, v20.8b \n" "mov v22.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -941,8 +928,8 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READNV12 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -964,8 +951,8 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READNV12 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -986,8 +973,8 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READNV12 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -1008,8 +995,8 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READNV12 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -1030,9 +1017,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 NVTORGB - RGBTORGB8_TOP - "subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP + "1: \n" READNV12 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8_TOP + ARGBTORGB565_FROM_TOP "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 // pixels // RGB565. @@ -1055,8 +1042,8 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV21InterleavedTable]] \n" - "1: \n" READYUY2 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUY2 + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] @@ -1076,8 +1063,8 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12InterleavedTable]] \n" - "1: \n" READUYVY NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READUYVY + "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] @@ -1188,10 +1175,10 @@ void DetileToYUY2_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "subs %w3, %w3, #16 \n" // store 8 YUY2 "prfm pldl1keep, [%0, 1792] \n" "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs "prfm pldl1keep, [%1, 1792] \n" - "subs %w3, %w3, #16 \n" // store 8 YUY2 "st2 {v0.16b,v1.16b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -1240,10 +1227,10 @@ void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { "1: \n" "ld1 {v7.16b}, [%0], #16 \n" "ld1 {v0.16b-v3.16b}, [%0], #64 \n" + "subs %2, %2, #80 \n" "shl v4.16b, v7.16b, #6 \n" "shl v5.16b, v7.16b, #4 \n" "shl v6.16b, v7.16b, #2 \n" - "subs %2, %2, #80 \n" "zip1 v16.16b, v4.16b, v0.16b \n" "zip1 v18.16b, v5.16b, v1.16b \n" "zip1 v20.16b, v6.16b, v2.16b \n" @@ -1305,8 +1292,8 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ld1 {v1.8h}, [%1], #16 \n" // load 8 V + "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ushl v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" "ushl v1.8h, v1.8h, v2.8h \n" @@ -1356,8 +1343,8 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, "dup v4.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ld1 {v1.8h}, [%1], #16 \n" // load 8 V + "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ushl v0.8h, v0.8h, v4.8h \n" "ushl v1.8h, v1.8h, v4.8h \n" "prfm pldl1keep, [%0, 448] \n" @@ -1596,6 +1583,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, "ldr d2, [%2], #8 \n" // B "ldr d1, [%1], #8 \n" // G "ldr d0, [%0], #8 \n" // R + "subs %w4, %w4, #4 \n" "ushll v2.4s, v2.4h, #0 \n" // B "ushll v1.4s, v1.4h, #0 \n" // G "ushll v0.4s, v0.4h, #0 \n" // R @@ -1608,7 +1596,6 @@ void MergeXR30Row_NEON(const uint16_t* src_r, "sli v2.4s, v1.4s, #10 \n" // 00GB "sli v2.4s, v0.4s, #20 \n" // 0RGB "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) - "subs %w4, %w4, #4 \n" "str q2, [%3], #16 \n" "b.gt 1b \n" : "+r"(src_r), // %0 @@ -1637,6 +1624,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, "ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr "ldr q1, [%1], #16 \n" // xxxxxxGggggggggg "ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb + "subs %w4, %w4, #8 \n" "umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr "umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg "movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000 @@ -1644,7 +1632,6 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, "mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000 "mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb "usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg - "subs %w4, %w4, #8 \n" "st2 {v3.8h, v4.8h}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_r), // %0 @@ -1674,6 +1661,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, "ldr q1, [%1], #16 \n" // G "ldr q0, [%2], #16 \n" // B "ldr q3, [%3], #16 \n" // A + "subs %w5, %w5, #8 \n" "umin v2.8h, v2.8h, v30.8h \n" "prfm pldl1keep, [%0, 448] \n" "umin v1.8h, v1.8h, v30.8h \n" @@ -1686,7 +1674,6 @@ void MergeAR64Row_NEON(const uint16_t* src_r, "ushl v1.8h, v1.8h, v31.8h \n" "ushl v0.8h, v0.8h, v31.8h \n" "ushl v3.8h, v3.8h, v31.8h \n" - "subs %w5, %w5, #8 \n" "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" "b.gt 1b \n" : "+r"(src_r), // %0 @@ -1718,6 +1705,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, "ldr q2, [%0], #16 \n" // R "ldr q1, [%1], #16 \n" // G "ldr q0, [%2], #16 \n" // B + "subs %w4, %w4, #8 \n" "umin v2.8h, v2.8h, v30.8h \n" "prfm pldl1keep, [%0, 448] \n" "umin v1.8h, v1.8h, v30.8h \n" @@ -1727,7 +1715,6 @@ void MergeXR64Row_NEON(const uint16_t* src_r, "ushl v2.8h, v2.8h, v31.8h \n" "ushl v1.8h, v1.8h, v31.8h \n" "ushl v0.8h, v0.8h, v31.8h \n" - "subs %w4, %w4, #8 \n" "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" "b.gt 1b \n" : "+r"(src_r), // %0 @@ -1756,6 +1743,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, "ldr q1, [%1], #16 \n" // G "ldr q2, [%2], #16 \n" // R "ldr q3, [%3], #16 \n" // A + "subs %w5, %w5, #8 \n" "uqshl v0.8h, v0.8h, v31.8h \n" "prfm pldl1keep, [%0, 448] \n" "uqshl v1.8h, v1.8h, v31.8h \n" @@ -1766,7 +1754,6 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, "prfm pldl1keep, [%3, 448] \n" "trn2 v0.16b, v0.16b, v1.16b \n" "trn2 v1.16b, v2.16b, v3.16b \n" - "subs %w5, %w5, #8 \n" "st2 {v0.8h, v1.8h}, [%4], #32 \n" "b.gt 1b \n" : "+r"(src_b), // %0 @@ -1794,6 +1781,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, "ldr q0, [%0], #16 \n" // B "ldr q1, [%1], #16 \n" // G "ldr q2, [%2], #16 \n" // R + "subs %w4, %w4, #8 \n" "uqshl v0.8h, v0.8h, v31.8h \n" "prfm pldl1keep, [%0, 448] \n" "uqshl v1.8h, v1.8h, v31.8h \n" @@ -1802,7 +1790,6 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, "prfm pldl1keep, [%2, 448] \n" "trn2 v0.16b, v0.16b, v1.16b \n" "trn2 v1.16b, v2.16b, v3.16b \n" - "subs %w4, %w4, #8 \n" "st2 {v0.8h, v1.8h}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_b), // %0 @@ -1994,8 +1981,8 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, "1: \n" "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of // RGB24. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_rgb24), // %0 @@ -2471,11 +2458,11 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "subs %w4, %w4, #16 \n" // 16 pixels "mov v3.8b, v2.8b \n" "prfm pldl1keep, [%0, 448] \n" "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "b.gt 1b \n" : "+r"(src_y), // %0 @@ -2569,10 +2556,10 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, asm volatile( "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. "mov v1.16b, v0.16b \n" "prfm pldl1keep, [%0, 448] \n" "mov v3.16b, v2.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels "b.gt 1b \n" @@ -2593,12 +2580,12 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. "tbl v0.16b, {v0.16b}, v4.16b \n" "tbl v2.16b, {v2.16b}, v4.16b \n" "prfm pldl1keep, [%0, 448] \n" "mov v1.16b, v0.16b \n" "mov v3.16b, v2.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels "b.gt 1b \n" @@ -2668,10 +2655,10 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" "prfm pldl1keep, [%0, 448] \n" "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. "stp q0, q2, [%1], #32 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_ar64), // %0 @@ -2692,10 +2679,10 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" "prfm pldl1keep, [%0, 448] \n" "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. "stp q0, q2, [%1], #32 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_ab64), // %0 @@ -2786,6 +2773,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( "movi v29.16b, #0x80 \n" // 128.5 "1: \n" "ldp q0, q1, [%[src]], #32 \n" + "subs %w[width], %w[width], #8 \n" // 8 processed per loop. "movi v2.4s, #0 \n" "movi v3.4s, #0 \n" "movi v4.4s, #0 \n" @@ -2795,7 +2783,6 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( "usdot v4.4s, v0.16b, v17.16b \n" "usdot v5.4s, v1.16b, v17.16b \n" "prfm pldl1keep, [%[src], 448] \n" - "subs %w[width], %w[width], #8 \n" // 8 processed per loop. "uzp1 v0.8h, v2.8h, v3.8h \n" "uzp1 v1.8h, v4.8h, v5.8h \n" "addhn v0.8b, v0.8h, v29.8h \n" // +128 -> unsigned @@ -2877,6 +2864,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. @@ -2892,7 +2880,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -2924,6 +2911,7 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. @@ -2938,7 +2926,6 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -2969,6 +2956,7 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. @@ -2983,7 +2971,6 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3014,6 +3001,7 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. @@ -3028,7 +3016,6 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3059,6 +3046,7 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. @@ -3073,7 +3061,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3099,6 +3086,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. @@ -3113,7 +3101,6 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, "urshr v1.8h, v3.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3139,6 +3126,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. @@ -3153,7 +3141,6 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, "urshr v2.8h, v2.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3179,6 +3166,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. @@ -3193,7 +3181,6 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3219,6 +3206,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. @@ -3233,7 +3221,6 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3259,6 +3246,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. @@ -3273,7 +3261,6 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, "urshr v1.8h, v1.8h, #1 \n" "urshr v0.8h, v0.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3300,6 +3287,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, RGBTOUV_SETUP_REG "1: \n" "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGB565TOARGB "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" @@ -3317,7 +3305,6 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "urshr v1.8h, v17.8h, #1 \n" "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3344,6 +3331,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, RGBTOUV_SETUP_REG "1: \n" "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGB555TOARGB "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" @@ -3361,7 +3349,6 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "urshr v1.8h, v17.8h, #1 \n" "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -3388,6 +3375,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. + "subs %w4, %w4, #16 \n" // 16 processed per loop. ARGB4444TORGB "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" @@ -3405,7 +3393,6 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "urshr v1.8h, v17.8h, #1 \n" "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. @@ -4210,9 +4197,9 @@ void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb, "ldr q25, [%[indices]] \n" "1: \n" "ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB + "subs %w[width], %w[width], #8 \n" // 8 processed per loop "movi v0.4s, #0 \n" "movi v2.4s, #0 \n" - "subs %w[width], %w[width], #8 \n" // 8 processed per loop "udot v0.4s, v1.16b, v24.16b \n" "udot v2.4s, v3.16b, v24.16b \n" "prfm pldl1keep, [%[src], 448] \n" @@ -4281,6 +4268,7 @@ void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) { "ldr d23, [%[indices]] \n" "1: \n" "ldp q0, q1, [%[dst]] \n" + "subs %w1, %w1, #8 \n" "movi v2.4s, #0 \n" "movi v3.4s, #0 \n" "movi v4.4s, #0 \n" @@ -4293,7 +4281,6 @@ void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) { "udot v5.4s, v1.16b, v21.16b \n" "udot v6.4s, v0.16b, v22.16b \n" "udot v7.4s, v1.16b, v22.16b \n" - "subs %w1, %w1, #8 \n" "prfm pldl1keep, [%[dst], 448] \n" "uzp1 v6.8h, v6.8h, v7.8h \n" "uzp1 v5.8h, v4.8h, v5.8h \n" @@ -4383,6 +4370,7 @@ void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb, "1: \n" "ld1 {v0.16b, v1.16b}, [%[src_argb]], #32 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. "movi v16.4s, #0 \n" "movi v17.4s, #0 \n" @@ -4393,8 +4381,6 @@ void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb, "movi v22.4s, #0 \n" "movi v23.4s, #0 \n" - // 8 processed per loop. - "subs %w2, %w2, #8 \n" "prfm pldl1keep, [%[src_argb], 448] \n" "sudot v16.4s, v31.16b, v0.4b[0] \n" @@ -4609,6 +4595,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels "usubl v0.8h, v0.8b, v1.8b \n" "prfm pldl1keep, [%0, 448] \n" "ld1 {v2.8b}, [%1],%5 \n" // center * 2 @@ -4619,7 +4606,6 @@ void SobelXRow_NEON(const uint8_t* src_y0, "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%2],%5 \n" // bottom "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels "prfm pldl1keep, [%2, 448] \n" "usubl v1.8h, v2.8b, v3.8b \n" "add v0.8h, v0.8h, v1.8h \n" @@ -4650,6 +4636,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, "1: \n" "ld1 {v0.8b}, [%0],%4 \n" // left "ld1 {v1.8b}, [%1],%4 \n" + "subs %w3, %w3, #8 \n" // 8 pixels "usubl v0.8h, v0.8b, v1.8b \n" "ld1 {v2.8b}, [%0],%4 \n" // center * 2 "ld1 {v3.8b}, [%1],%4 \n" @@ -4658,7 +4645,6 @@ void SobelYRow_NEON(const uint8_t* src_y0, "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%0],%5 \n" // right "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels "usubl v1.8h, v2.8b, v3.8b \n" "prfm pldl1keep, [%0, 448] \n" "add v0.8h, v0.8h, v1.8h \n" @@ -4881,8 +4867,8 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { asm volatile( "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop + "prfm pldl1keep, [%0, 448] \n" "fmul v1.4s, v1.4s, %3.s[0] \n" // scale "fmul v2.4s, v2.4s, %3.s[0] \n" // scale "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples @@ -4909,6 +4895,7 @@ void GaussCol_NEON(const uint16_t* src0, "1: \n" "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows "ld1 {v2.8h}, [%4], #16 \n" + "subs %w6, %w6, #8 \n" // 8 processed per loop "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 "prfm pldl1keep, [%0, 448] \n" "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 @@ -4924,7 +4911,6 @@ void GaussCol_NEON(const uint16_t* src0, "umlal v0.4s, v2.4h, v6.4h \n" // * 4 "prfm pldl1keep, [%3, 448] \n" "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "subs %w6, %w6, #8 \n" // 8 processed per loop "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples "prfm pldl1keep, [%4, 448] \n" "b.gt 1b \n" @@ -4950,6 +4936,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { "1: \n" "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "subs %w5, %w5, #8 \n" // 8 processed per loop "add v0.4s, v0.4s, v1.4s \n" // * 1 "add v1.4s, v1.4s, v2.4s \n" // * 1 "ld1 {v2.4s,v3.4s}, [%2], #32 \n" @@ -4962,7 +4949,6 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { "prfm pldl1keep, [%0, 448] \n" "mla v0.4s, v2.4s, v6.4s \n" // * 4 "mla v1.4s, v3.4s, v6.4s \n" // * 4 - "subs %w5, %w5, #8 \n" // 8 processed per loop "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack "uqrshrn2 v0.8h, v1.4s, #8 \n" "st1 {v0.8h}, [%4], #16 \n" // store 8 samples @@ -4993,6 +4979,7 @@ void GaussCol_F32_NEON(const float* src0, "1: \n" "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows "ld1 {v2.4s, v3.4s}, [%1], #32 \n" + "subs %w6, %w6, #8 \n" // 8 processed per loop "fmla v0.4s, v2.4s, v6.4s \n" // * 4 "ld1 {v4.4s, v5.4s}, [%2], #32 \n" "fmla v1.4s, v3.4s, v6.4s \n" @@ -5009,7 +4996,6 @@ void GaussCol_F32_NEON(const float* src0, "prfm pldl1keep, [%3, 448] \n" "fadd v1.4s, v1.4s, v5.4s \n" "prfm pldl1keep, [%4, 448] \n" - "subs %w6, %w6, #8 \n" // 8 processed per loop "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples "b.gt 1b \n" : "+r"(src0), // %0 @@ -5031,6 +5017,7 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) { "1: \n" "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 // rows + "subs %w2, %w2, #8 \n" // 8 processed per loop "fadd v0.4s, v0.4s, v1.4s \n" // * 1 "ld1 {v4.4s, v5.4s}, [%0], %5 \n" "fadd v1.4s, v1.4s, v2.4s \n" @@ -5045,7 +5032,6 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) { "prfm pldl1keep, [%0, 448] \n" "fmul v0.4s, v0.4s, v8.4s \n" // / 256 "fmul v1.4s, v1.4s, v8.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples "b.gt 1b \n" : "+r"(src), // %0 @@ -5068,11 +5054,11 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "subs %w3, %w3, #16 \n" // 16 pixels per loop "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values "prfm pldl1keep, [%0, 448] \n" "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 pixels per loop "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels "b.gt 1b \n" : "+r"(src_y), // %0 @@ -5100,12 +5086,12 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values + "subs %w3, %w3, #16 \n" // 16 pixels per loop "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24 "prfm pldl1keep, [%0, 448] \n" "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n" "prfm pldl1keep, [%1, 448] \n" "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n" - "subs %w3, %w3, #16 \n" // 16 pixels per loop "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels "b.gt 1b \n" : "+r"(src_y), // %0 @@ -5129,6 +5115,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "subs %w3, %w3, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. @@ -5138,7 +5125,6 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "prfm pldl1keep, [%1, 448] \n" "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v2.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. "b.gt 1b \n" : "+r"(src_ayuv), // %0 @@ -5158,6 +5144,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "subs %w3, %w3, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. @@ -5167,7 +5154,6 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "prfm pldl1keep, [%1, 448] \n" "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. "b.gt 1b \n" : "+r"(src_ayuv), // %0 @@ -5227,6 +5213,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values "ld1 {v2.16b}, [%1], #16 \n" "ld1 {v3.16b}, [%3], #16 \n" + "subs %w5, %w5, #16 \n" // 16 src pixels per loop "uaddlp v0.8h, v0.16b \n" // half size "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" @@ -5237,7 +5224,6 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, "prfm pldl1keep, [%3, 448] \n" "uqrshrn v0.8b, v0.8h, #2 \n" "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w5, %w5, #16 \n" // 16 src pixels per loop "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels "b.gt 1b \n" : "+r"(src_u), // %0 @@ -5376,7 +5362,7 @@ void Convert8To8Row_NEON(const uint8_t* src_y, "uzp2 v1.16b, v2.16b, v3.16b \n" "add v0.16b, v0.16b, v5.16b \n" // add bias (16) "add v1.16b, v1.16b, v5.16b \n" - "stp q0, q1, [%1], #32 \n" // store 16 pixels + "stp q0, q1, [%1], #32 \n" // store 32 pixels "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1