mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Revert "Disable NV12ToARGB_SVE2 which fails the 'any' test"
This reverts commit f480fa1c4a4af0ce3c34cd7b1ab0d85f1a36ce17.

This code has a number of small issues:

* The YUVTORGB_SVE_SETUP macro requires p0 to be initialized to all-true, however the existing kernel does not initialize p0 until after this macro is called, so flip the order.
* The p2 register is missing from the clobber list, so add it.
* The existing code uses the wrong condition flags when determining whether to do the tail iteration using WHILE instructions or not. Additionally, the number of tail iterations is incorrect, as it was mistakenly left unchanged from when the tail code was always executed.

While we are here, make another few small improvements:

* Remove the single-quote digit separators as requested here: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5622133
* Remove "volatile" from the asm block counting the vector length. This particular asm block cannot be removed by the compiler since the output register is consumed by subsequent code, so "volatile" is unnecessary here and we remove it.
* Add some additional empty comments to force clang-format to put macros into the next line rather than on the same line as other asm.

Bug: b/352371649
Change-Id: I45676fab95343f588cf11ce2cf9186ffbe87489e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5703586
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
e1a93c79fc
commit
a64fffe632
@ -600,9 +600,8 @@ extern "C" {
|
|||||||
#define HAS_I422TORGBAROW_SVE2
|
#define HAS_I422TORGBAROW_SVE2
|
||||||
#define HAS_I444ALPHATOARGBROW_SVE2
|
#define HAS_I444ALPHATOARGBROW_SVE2
|
||||||
#define HAS_I444TOARGBROW_SVE2
|
#define HAS_I444TOARGBROW_SVE2
|
||||||
// Any support for NV12 SVE2 fails
|
#define HAS_NV12TOARGBROW_SVE2
|
||||||
//#define HAS_NV12TOARGBROW_SVE2
|
#define HAS_NV21TOARGBROW_SVE2
|
||||||
//#define HAS_NV21TOARGBROW_SVE2
|
|
||||||
#define HAS_RAWTOARGBROW_SVE2
|
#define HAS_RAWTOARGBROW_SVE2
|
||||||
#define HAS_RAWTORGB24ROW_SVE2
|
#define HAS_RAWTORGB24ROW_SVE2
|
||||||
#define HAS_RAWTORGBAROW_SVE2
|
#define HAS_RAWTORGBAROW_SVE2
|
||||||
|
|||||||
@ -434,29 +434,27 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
|
|||||||
uint32_t nv_uv_start,
|
uint32_t nv_uv_start,
|
||||||
uint32_t nv_uv_step) {
|
uint32_t nv_uv_step) {
|
||||||
uint64_t vl;
|
uint64_t vl;
|
||||||
asm volatile (
|
asm("cnth %0" : "=r"(vl));
|
||||||
"cnth %0" : "=r"(vl));
|
|
||||||
int width_last_y = width & (vl - 1);
|
int width_last_y = width & (vl - 1);
|
||||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
|
||||||
int width_last_uv = width_last_y + (width_last_y & 1);
|
int width_last_uv = width_last_y + (width_last_y & 1);
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
"ptrue p0.b \n" //
|
||||||
YUVTORGB_SVE_SETUP
|
YUVTORGB_SVE_SETUP
|
||||||
"ptrue p0.b \n"
|
|
||||||
"index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n"
|
"index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n"
|
||||||
"dup z19.b, #255 \n" // A
|
"dup z19.b, #255 \n" // A
|
||||||
"subs %w[width], %w[width], %w[vl] \n"
|
"subs %w[width], %w[width], %w[vl] \n"
|
||||||
"b.le 2f \n"
|
"b.lt 2f \n"
|
||||||
|
|
||||||
// Run bulk of computation with an all-true predicate to avoid predicate
|
// Run bulk of computation with an all-true predicate to avoid predicate
|
||||||
// generation overhead.
|
// generation overhead.
|
||||||
"ptrue p1.h \n"
|
"ptrue p1.h \n"
|
||||||
"ptrue p2.h \n"
|
"ptrue p2.h \n"
|
||||||
"1: \n" READNV_SVE
|
"1: \n" //
|
||||||
NVTORGB_SVE RGBTOARGB8_SVE
|
READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
|
||||||
"subs %w[width], %w[width], %w[vl] \n"
|
"subs %w[width], %w[width], %w[vl] \n"
|
||||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||||
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
|
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
|
||||||
"b.gt 1b \n"
|
"b.ge 1b \n"
|
||||||
|
|
||||||
"2: \n"
|
"2: \n"
|
||||||
"adds %w[width], %w[width], %w[vl] \n"
|
"adds %w[width], %w[width], %w[vl] \n"
|
||||||
@ -465,8 +463,8 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
|
|||||||
// Calculate a predicate for the final iteration to deal with the tail.
|
// Calculate a predicate for the final iteration to deal with the tail.
|
||||||
"3: \n"
|
"3: \n"
|
||||||
"whilelt p1.h, wzr, %w[width_last_y] \n"
|
"whilelt p1.h, wzr, %w[width_last_y] \n"
|
||||||
"whilelt p2.h, wzr, %w[width_last_uv] \n" READNV_SVE
|
"whilelt p2.h, wzr, %w[width_last_uv] \n" //
|
||||||
NVTORGB_SVE RGBTOARGB8_SVE
|
READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
|
||||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||||
|
|
||||||
"99: \n"
|
"99: \n"
|
||||||
@ -481,7 +479,7 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
|
|||||||
[nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step]
|
[nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step]
|
||||||
[width_last_y] "r"(width_last_y), // %[width_last_y]
|
[width_last_y] "r"(width_last_y), // %[width_last_y]
|
||||||
[width_last_uv] "r"(width_last_uv) // %[width_last_uv]
|
[width_last_uv] "r"(width_last_uv) // %[width_last_uv]
|
||||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
: "cc", "memory", YUVTORGB_SVE_REGS, "p2");
|
||||||
}
|
}
|
||||||
|
|
||||||
void NV12ToARGBRow_SVE2(const uint8_t* src_y,
|
void NV12ToARGBRow_SVE2(const uint8_t* src_y,
|
||||||
@ -489,8 +487,8 @@ void NV12ToARGBRow_SVE2(const uint8_t* src_y,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
const struct YuvConstants* yuvconstants,
|
const struct YuvConstants* yuvconstants,
|
||||||
int width) {
|
int width) {
|
||||||
uint32_t nv_uv_start = 0x0200'0200U;
|
uint32_t nv_uv_start = 0x02000200U;
|
||||||
uint32_t nv_uv_step = 0x0404'0404U;
|
uint32_t nv_uv_step = 0x04040404U;
|
||||||
NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start,
|
NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start,
|
||||||
nv_uv_step);
|
nv_uv_step);
|
||||||
}
|
}
|
||||||
@ -500,8 +498,8 @@ void NV21ToARGBRow_SVE2(const uint8_t* src_y,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
const struct YuvConstants* yuvconstants,
|
const struct YuvConstants* yuvconstants,
|
||||||
int width) {
|
int width) {
|
||||||
uint32_t nv_uv_start = 0x0002'0002U;
|
uint32_t nv_uv_start = 0x00020002U;
|
||||||
uint32_t nv_uv_step = 0x0404'0404U;
|
uint32_t nv_uv_step = 0x04040404U;
|
||||||
NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start,
|
NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start,
|
||||||
nv_uv_step);
|
nv_uv_step);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user