mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
Row AArch64 Neon implementation - Part 4
BUG=319
TESTED=libyuv_unittest
R=fbarchard@chromium.org, fbarchard@google.com

Change-Id: If145660d999e95246efeedb64a45ba70bf0fe23e
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>

Review URL: https://webrtc-codereview.appspot.com/13199004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1054 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
720e3a247f
commit
cb8be2fb2b
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1053
|
Version: 1054
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -279,25 +279,25 @@ extern "C" {
|
|||||||
// #define HAS_MIRRORROW_NEON
|
// #define HAS_MIRRORROW_NEON
|
||||||
// #define HAS_MIRRORUVROW_NEON
|
// #define HAS_MIRRORUVROW_NEON
|
||||||
// #define HAS_ARGBMIRRORROW_NEON
|
// #define HAS_ARGBMIRRORROW_NEON
|
||||||
// #define HAS_RGB24TOARGBROW_NEON
|
#define HAS_RGB24TOARGBROW_NEON
|
||||||
// #define HAS_RAWTOARGBROW_NEON
|
#define HAS_RAWTOARGBROW_NEON
|
||||||
// #define HAS_RGB565TOARGBROW_NEON
|
// #define HAS_RGB565TOARGBROW_NEON
|
||||||
// #define HAS_ARGB1555TOARGBROW_NEON
|
// #define HAS_ARGB1555TOARGBROW_NEON
|
||||||
// #define HAS_ARGB4444TOARGBROW_NEON
|
// #define HAS_ARGB4444TOARGBROW_NEON
|
||||||
// #define HAS_ARGBTORGB24ROW_NEON
|
#define HAS_ARGBTORGB24ROW_NEON
|
||||||
// #define HAS_ARGBTORAWROW_NEON
|
#define HAS_ARGBTORAWROW_NEON
|
||||||
// #define HAS_YUY2TOYROW_NEON
|
#define HAS_YUY2TOYROW_NEON
|
||||||
// #define HAS_UYVYTOYROW_NEON
|
#define HAS_UYVYTOYROW_NEON
|
||||||
// #define HAS_YUY2TOUV422ROW_NEON
|
#define HAS_YUY2TOUV422ROW_NEON
|
||||||
// #define HAS_UYVYTOUV422ROW_NEON
|
#define HAS_UYVYTOUV422ROW_NEON
|
||||||
// #define HAS_YUY2TOUVROW_NEON
|
#define HAS_YUY2TOUVROW_NEON
|
||||||
// #define HAS_UYVYTOUVROW_NEON
|
#define HAS_UYVYTOUVROW_NEON
|
||||||
// #define HAS_HALFROW_NEON
|
#define HAS_HALFROW_NEON
|
||||||
// #define HAS_ARGBTOBAYERROW_NEON
|
#define HAS_ARGBTOBAYERROW_NEON
|
||||||
// #define HAS_ARGBTOBAYERGGROW_NEON
|
#define HAS_ARGBTOBAYERGGROW_NEON
|
||||||
// #define HAS_ARGBSHUFFLEROW_NEON
|
#define HAS_ARGBSHUFFLEROW_NEON
|
||||||
// #define HAS_I422TOYUY2ROW_NEON
|
#define HAS_I422TOYUY2ROW_NEON
|
||||||
// #define HAS_I422TOUYVYROW_NEON
|
#define HAS_I422TOUYVYROW_NEON
|
||||||
// #define HAS_ARGBTORGB565ROW_NEON
|
// #define HAS_ARGBTORGB565ROW_NEON
|
||||||
// #define HAS_ARGBTOARGB1555ROW_NEON
|
// #define HAS_ARGBTOARGB1555ROW_NEON
|
||||||
// #define HAS_ARGBTOARGB4444ROW_NEON
|
// #define HAS_ARGBTOARGB4444ROW_NEON
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1053
|
#define LIBYUV_VERSION 1054
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -79,9 +79,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
|
|||||||
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
|
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
|
||||||
1, 2, 7)
|
1, 2, 7)
|
||||||
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
|
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
|
||||||
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
|
|
||||||
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
|
|
||||||
#endif // HAS_I422TOARGBROW_NEON
|
#endif // HAS_I422TOARGBROW_NEON
|
||||||
|
#ifdef HAS_I422TOYUY2ROW_NEON
|
||||||
|
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
|
||||||
|
#endif // HAS_I422TOYUY2ROW_NEON
|
||||||
|
#ifdef HAS_I422TOUYVYROW_NEON
|
||||||
|
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
|
||||||
|
#endif // HAS_I422TOUYVYROW_NEON
|
||||||
#undef YANY
|
#undef YANY
|
||||||
|
|
||||||
// Wrappers to handle odd width
|
// Wrappers to handle odd width
|
||||||
@ -250,12 +254,26 @@ YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
|
|||||||
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
|
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
|
||||||
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
|
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
|
||||||
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
|
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_YUY2TOYROW_NEON
|
||||||
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
|
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_UYVYTOYROW_NEON
|
||||||
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
|
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_RGB24TOARGBROW_NEON
|
||||||
YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
|
YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_RAWTOARGBROW_NEON
|
||||||
YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
|
YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_RGB565TOARGBROW_NEON
|
||||||
YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
|
YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_ARGB1555TOARGBROW_NEON
|
||||||
YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
|
YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_ARGB4444TOARGBROW_NEON
|
||||||
YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
|
YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
|
||||||
#endif
|
#endif
|
||||||
#undef YANY
|
#undef YANY
|
||||||
@ -333,7 +351,11 @@ UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
|
|||||||
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
|
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
|
||||||
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
|
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
|
||||||
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
|
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_YUY2TOUVROW_NEON
|
||||||
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
|
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_UYVYTOUVROW_NEON
|
||||||
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
|
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
|
||||||
#endif
|
#endif
|
||||||
#undef UVANY
|
#undef UVANY
|
||||||
|
|||||||
@ -1007,20 +1007,20 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
|||||||
#ifdef HAS_RGB24TOARGBROW_NEON
|
#ifdef HAS_RGB24TOARGBROW_NEON
|
||||||
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"vmov.u8 d4, #255 \n" // Alpha
|
"movi v4.8b, #255 \n" // Alpha
|
||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
|
"ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
|
"st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_rgb24), // %0
|
: "+r"(src_rgb24), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(pix) // %2
|
"+r"(pix) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
|
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_RGB24TOARGBROW_NEON
|
#endif // HAS_RGB24TOARGBROW_NEON
|
||||||
@ -1028,21 +1028,22 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
|||||||
#ifdef HAS_RAWTOARGBROW_NEON
|
#ifdef HAS_RAWTOARGBROW_NEON
|
||||||
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
|
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"vmov.u8 d4, #255 \n" // Alpha
|
"movi v5.8b, #255 \n" // Alpha
|
||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
|
"ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||||
"vswp.u8 d1, d3 \n" // swap R, B
|
"mov v3.8b, v1.8b \n" // move g
|
||||||
|
"mov v4.8b, v0.8b \n" // move r
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
|
"st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_raw), // %0
|
: "+r"(src_raw), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(pix) // %2
|
"+r"(pix) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_RAWTOARGBROW_NEON
|
#endif // HAS_RAWTOARGBROW_NEON
|
||||||
@ -1170,16 +1171,16 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
|
|||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
|
"ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
|
"st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_rgb24), // %1
|
"+r"(dst_rgb24), // %1
|
||||||
"+r"(pix) // %2
|
"+r"(pix) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
|
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_ARGBTORGB24ROW_NEON
|
#endif // HAS_ARGBTORGB24ROW_NEON
|
||||||
@ -1190,17 +1191,18 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
|
|||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
|
"ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||||
"vswp.u8 d1, d3 \n" // swap R, B
|
"mov v4.8b, v2.8b \n" // mov g
|
||||||
|
"mov v5.8b, v1.8b \n" // mov b
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
|
"st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_raw), // %1
|
"+r"(dst_raw), // %1
|
||||||
"+r"(pix) // %2
|
"+r"(pix) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
|
: "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_ARGBTORAWROW_NEON
|
#endif // HAS_ARGBTORAWROW_NEON
|
||||||
@ -1211,16 +1213,16 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
|
|||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
|
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
||||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
|
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_yuy2), // %0
|
: "+r"(src_yuy2), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(pix) // %2
|
"+r"(pix) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "q0", "q1" // Clobber List
|
: "cc", "memory", "v0", "v1" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_YUY2TOYROW_NEON
|
#endif // HAS_YUY2TOYROW_NEON
|
||||||
@ -1231,16 +1233,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
|
|||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
|
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
||||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
|
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_uyvy), // %0
|
: "+r"(src_uyvy), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(pix) // %2
|
"+r"(pix) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "q0", "q1" // Clobber List
|
: "cc", "memory", "v0", "v1" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_UYVYTOYROW_NEON
|
#endif // HAS_UYVYTOYROW_NEON
|
||||||
@ -1252,19 +1254,19 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
|
|||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
|
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
||||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst1.8 {d1}, [%1]! \n" // store 8 U.
|
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
|
||||||
MEMACCESS(2)
|
MEMACCESS(2)
|
||||||
"vst1.8 {d3}, [%2]! \n" // store 8 V.
|
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_yuy2), // %0
|
: "+r"(src_yuy2), // %0
|
||||||
"+r"(dst_u), // %1
|
"+r"(dst_u), // %1
|
||||||
"+r"(dst_v), // %2
|
"+r"(dst_v), // %2
|
||||||
"+r"(pix) // %3
|
"+r"(pix) // %3
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
|
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_YUY2TOUV422ROW_NEON
|
#endif // HAS_YUY2TOUV422ROW_NEON
|
||||||
@ -1276,19 +1278,19 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
|
|||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
|
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
||||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 U.
|
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
|
||||||
MEMACCESS(2)
|
MEMACCESS(2)
|
||||||
"vst1.8 {d2}, [%2]! \n" // store 8 V.
|
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_uyvy), // %0
|
: "+r"(src_uyvy), // %0
|
||||||
"+r"(dst_u), // %1
|
"+r"(dst_u), // %1
|
||||||
"+r"(dst_v), // %2
|
"+r"(dst_v), // %2
|
||||||
"+r"(pix) // %3
|
"+r"(pix) // %3
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
|
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_UYVYTOUV422ROW_NEON
|
#endif // HAS_UYVYTOUV422ROW_NEON
|
||||||
@ -1297,20 +1299,20 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
|
|||||||
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
|
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
|
||||||
uint8* dst_u, uint8* dst_v, int pix) {
|
uint8* dst_u, uint8* dst_v, int pix) {
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"add %1, %0, %1 \n" // stride + src_yuy2
|
"add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
|
||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
|
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
||||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
|
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
|
||||||
"vrhadd.u8 d1, d1, d5 \n" // average rows of U
|
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
|
||||||
"vrhadd.u8 d3, d3, d7 \n" // average rows of V
|
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
|
||||||
MEMACCESS(2)
|
MEMACCESS(2)
|
||||||
"vst1.8 {d1}, [%2]! \n" // store 8 U.
|
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
|
||||||
MEMACCESS(3)
|
MEMACCESS(3)
|
||||||
"vst1.8 {d3}, [%3]! \n" // store 8 V.
|
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_yuy2), // %0
|
: "+r"(src_yuy2), // %0
|
||||||
"+r"(stride_yuy2), // %1
|
"+r"(stride_yuy2), // %1
|
||||||
@ -1318,7 +1320,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
|
|||||||
"+r"(dst_v), // %3
|
"+r"(dst_v), // %3
|
||||||
"+r"(pix) // %4
|
"+r"(pix) // %4
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_YUY2TOUVROW_NEON
|
#endif // HAS_YUY2TOUVROW_NEON
|
||||||
@ -1327,20 +1329,20 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
|
|||||||
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
|
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
|
||||||
uint8* dst_u, uint8* dst_v, int pix) {
|
uint8* dst_u, uint8* dst_v, int pix) {
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"add %1, %0, %1 \n" // stride + src_uyvy
|
"add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
|
||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
|
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
||||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
|
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
|
||||||
"vrhadd.u8 d0, d0, d4 \n" // average rows of U
|
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
|
||||||
"vrhadd.u8 d2, d2, d6 \n" // average rows of V
|
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
|
||||||
MEMACCESS(2)
|
MEMACCESS(2)
|
||||||
"vst1.8 {d0}, [%2]! \n" // store 8 U.
|
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
|
||||||
MEMACCESS(3)
|
MEMACCESS(3)
|
||||||
"vst1.8 {d2}, [%3]! \n" // store 8 V.
|
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_uyvy), // %0
|
: "+r"(src_uyvy), // %0
|
||||||
"+r"(stride_uyvy), // %1
|
"+r"(stride_uyvy), // %1
|
||||||
@ -1348,7 +1350,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
|
|||||||
"+r"(dst_v), // %3
|
"+r"(dst_v), // %3
|
||||||
"+r"(pix) // %4
|
"+r"(pix) // %4
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_UYVYTOUVROW_NEON
|
#endif // HAS_UYVYTOUVROW_NEON
|
||||||
@ -1358,23 +1360,23 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
|
|||||||
uint8* dst_uv, int pix) {
|
uint8* dst_uv, int pix) {
|
||||||
asm volatile (
|
asm volatile (
|
||||||
// change the stride to row 2 pointer
|
// change the stride to row 2 pointer
|
||||||
"add %1, %0 \n"
|
"add %x1, %x0, %w1, sxtw \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
|
"ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
|
||||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
|
"ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
|
||||||
"vrhadd.u8 q0, q1 \n" // average row 1 and 2
|
"urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
|
||||||
MEMACCESS(2)
|
MEMACCESS(2)
|
||||||
"vst1.8 {q0}, [%2]! \n"
|
"st1 {v0.16b}, [%2], #16 \n"
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_uv), // %0
|
: "+r"(src_uv), // %0
|
||||||
"+r"(src_uv_stride), // %1
|
"+r"(src_uv_stride), // %1
|
||||||
"+r"(dst_uv), // %2
|
"+r"(dst_uv), // %2
|
||||||
"+r"(pix) // %3
|
"+r"(pix) // %3
|
||||||
:
|
:
|
||||||
: "cc", "memory", "q0", "q1" // Clobber List
|
: "cc", "memory", "v0", "v1" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_HALFROW_NEON
|
#endif // HAS_HALFROW_NEON
|
||||||
@ -1384,22 +1386,22 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
|
|||||||
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
|
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
|
||||||
uint32 selector, int pix) {
|
uint32 selector, int pix) {
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"vmov.u32 d6[0], %3 \n" // selector
|
"mov v2.s[0], %w3 \n" // selector
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
|
"ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||||
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
|
"tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
|
||||||
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
|
"tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
|
||||||
"vtrn.u32 d4, d5 \n" // combine 8 pixels
|
"trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst1.8 {d4}, [%1]! \n" // store 8.
|
"st1 {v4.8b}, [%1], #8 \n" // store 8.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_bayer), // %1
|
"+r"(dst_bayer), // %1
|
||||||
"+r"(pix) // %2
|
"+r"(pix) // %2
|
||||||
: "r"(selector) // %3
|
: "r"(selector) // %3
|
||||||
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
|
: "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_ARGBTOBAYERROW_NEON
|
#endif // HAS_ARGBTOBAYERROW_NEON
|
||||||
@ -1411,16 +1413,16 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
|
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst1.8 {d1}, [%1]! \n" // store 8 G's.
|
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_bayer), // %1
|
"+r"(dst_bayer), // %1
|
||||||
"+r"(pix) // %2
|
"+r"(pix) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "q0", "q1" // Clobber List
|
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_ARGBTOBAYERGGROW_NEON
|
#endif // HAS_ARGBTOBAYERGGROW_NEON
|
||||||
@ -1431,21 +1433,20 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
|
|||||||
const uint8* shuffler, int pix) {
|
const uint8* shuffler, int pix) {
|
||||||
asm volatile (
|
asm volatile (
|
||||||
MEMACCESS(3)
|
MEMACCESS(3)
|
||||||
"vld1.8 {q2}, [%3] \n" // shuffler
|
"ld1 {v2.16b}, [%3] \n" // shuffler
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
|
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
|
||||||
"subs %2, %2, #4 \n" // 4 processed per loop
|
"subs %2, %2, #4 \n" // 4 processed per loop
|
||||||
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
|
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
|
||||||
"vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
|
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vst1.8 {q1}, [%1]! \n" // store 4.
|
"st1 {v1.16b}, [%1], #16 \n" // store 4.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(pix) // %2
|
"+r"(pix) // %2
|
||||||
: "r"(shuffler) // %3
|
: "r"(shuffler) // %3
|
||||||
: "cc", "memory", "q0", "q1", "q2" // Clobber List
|
: "cc", "memory", "v0", "v1", "v2" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_ARGBSHUFFLEROW_NEON
|
#endif // HAS_ARGBSHUFFLEROW_NEON
|
||||||
@ -1459,14 +1460,15 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
|
|||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
|
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
|
||||||
|
"mov v2.8b, v1.8b \n"
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vld1.8 {d1}, [%1]! \n" // load 8 Us
|
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
|
||||||
MEMACCESS(2)
|
MEMACCESS(2)
|
||||||
"vld1.8 {d3}, [%2]! \n" // load 8 Vs
|
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
|
||||||
"subs %4, %4, #16 \n" // 16 pixels
|
"subs %4, %4, #16 \n" // 16 pixels
|
||||||
MEMACCESS(3)
|
MEMACCESS(3)
|
||||||
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
|
"st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_y), // %0
|
: "+r"(src_y), // %0
|
||||||
"+r"(src_u), // %1
|
"+r"(src_u), // %1
|
||||||
@ -1474,7 +1476,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
|
|||||||
"+r"(dst_yuy2), // %3
|
"+r"(dst_yuy2), // %3
|
||||||
"+r"(width) // %4
|
"+r"(width) // %4
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3"
|
: "cc", "memory", "v0", "v1", "v2", "v3"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_I422TOYUY2ROW_NEON
|
#endif // HAS_I422TOYUY2ROW_NEON
|
||||||
@ -1488,14 +1490,15 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
|
|||||||
".p2align 2 \n"
|
".p2align 2 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
MEMACCESS(0)
|
MEMACCESS(0)
|
||||||
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
|
"ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
|
||||||
|
"mov v3.8b, v2.8b \n"
|
||||||
MEMACCESS(1)
|
MEMACCESS(1)
|
||||||
"vld1.8 {d0}, [%1]! \n" // load 8 Us
|
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
|
||||||
MEMACCESS(2)
|
MEMACCESS(2)
|
||||||
"vld1.8 {d2}, [%2]! \n" // load 8 Vs
|
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
|
||||||
"subs %4, %4, #16 \n" // 16 pixels
|
"subs %4, %4, #16 \n" // 16 pixels
|
||||||
MEMACCESS(3)
|
MEMACCESS(3)
|
||||||
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
|
"st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_y), // %0
|
: "+r"(src_y), // %0
|
||||||
"+r"(src_u), // %1
|
"+r"(src_u), // %1
|
||||||
@ -1503,7 +1506,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
|
|||||||
"+r"(dst_uyvy), // %3
|
"+r"(dst_uyvy), // %3
|
||||||
"+r"(width) // %4
|
"+r"(width) // %4
|
||||||
:
|
:
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3"
|
: "cc", "memory", "v0", "v1", "v2", "v3"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif // HAS_I422TOUYVYROW_NEON
|
#endif // HAS_I422TOUYVYROW_NEON
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user