diff --git a/source/convert_argb.cc b/source/convert_argb.cc index dd94c1b77..a77358df8 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3792,7 +3792,7 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555, #if defined(HAS_ARGB1555TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; } } diff --git a/source/row_any.cc b/source/row_any.cc index b9eaa0a3c..d2f8a5419 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1393,7 +1393,7 @@ ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15) ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31) #endif #ifdef HAS_ARGB1555TOARGBROW_NEON -ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) +ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 15) #endif #ifdef HAS_ARGB1555TOARGBROW_MSA ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 6d50b2760..7ad54b430 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2083,17 +2083,19 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, ); } -#define ARGB1555TOARGB \ - /* Input: ARRRRRGGGGGBBBBB */ \ - "xtn v29.8b, v0.8h \n" /* xxxBBBBB */ \ - "shrn v3.8b, v0.8h, #8 \n" /* Axxxxxxx */ \ - "shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \ - "shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \ - "shl v0.8b, v29.8b, #3 \n" /* BBBBB000 */ \ - "sshr v3.8b, v3.8b, #7 \n" /* AAAAAAAA */ \ - "sri v2.8b, v2.8b, #5 \n" /* RRRRRRRR */ \ - "sri v1.8b, v1.8b, #5 \n" /* GGGGGGGG */ \ - "sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */ +#define ARGB1555TOARGB \ + /* Input: ARRRRRGGGGGBBBBB */ \ + "shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \ + "uzp1 v29.16b, v0.16b, v4.16b \n" /* xxxBBBBB */ \ + "shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \ + "uzp2 v3.16b, v0.16b, v4.16b \n" /* Axxxxxxx */ \ + "shrn2 v2.16b, v4.8h, #7 \n" /* RRRRRxxx */ \ + "shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \ + "shrn2 v1.16b, v4.8h, #2 \n" /* GGGGGxxx */ \ + "sshr v3.16b, v3.16b, #7 \n" /* AAAAAAAA */ \ + "sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \ + "sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */ \ + "sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */ // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. #define RGB555TOARGB \ @@ -2112,14 +2114,15 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + asm volatile( + "1: \n" + "ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop ARGB1555TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB + "add %1, %1, #64 \n" + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2