mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
ARGBToUV 64 bit use ymm8 for shuffler
Bug: 381138208
Change-Id: I5e69bc1610bd6269bf9a4113e729cf307dd36f60
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6536833
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
61bdaee13a
commit
0853c9353f
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||||
Version: 1909
|
Version: 1910
|
||||||
License: BSD-3-Clause
|
License: BSD-3-Clause
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
Shipped: yes
|
Shipped: yes
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1909
|
#define LIBYUV_VERSION 1910
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -116,7 +116,8 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
|
|||||||
uint32_t hash = seed;
|
uint32_t hash = seed;
|
||||||
const uint32_t c16 = 0x92d9e201; // 33^16
|
const uint32_t c16 = 0x92d9e201; // 33^16
|
||||||
uint32_t tmp, tmp2;
|
uint32_t tmp, tmp2;
|
||||||
asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
|
asm(
|
||||||
|
"ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
|
||||||
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
|
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
|
||||||
|
|
||||||
// count is always a multiple of 16.
|
// count is always a multiple of 16.
|
||||||
|
|||||||
1018
source/row_gcc.cc
1018
source/row_gcc.cc
File diff suppressed because it is too large
Load Diff
@ -291,12 +291,12 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
|
|||||||
uint16_t limit = 0x3ff0;
|
uint16_t limit = 0x3ff0;
|
||||||
uint16_t alpha = 0xc000;
|
uint16_t alpha = 0xc000;
|
||||||
asm volatile(YUVTORGB_SETUP
|
asm volatile(YUVTORGB_SETUP
|
||||||
"dup v22.8h, %w[limit] \n"
|
"dup v22.8h, %w[limit] \n"
|
||||||
"dup v23.8h, %w[alpha] \n"
|
"dup v23.8h, %w[alpha] \n"
|
||||||
"1: \n" //
|
"1: \n" //
|
||||||
READYUV210
|
READYUV210
|
||||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: [src_y] "+r"(src_y), // %[src_y]
|
: [src_y] "+r"(src_y), // %[src_y]
|
||||||
[src_u] "+r"(src_u), // %[src_u]
|
[src_u] "+r"(src_u), // %[src_u]
|
||||||
[src_v] "+r"(src_v), // %[src_v]
|
[src_v] "+r"(src_v), // %[src_v]
|
||||||
@ -320,12 +320,12 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
|
|||||||
uint16_t limit = 0x3ff0;
|
uint16_t limit = 0x3ff0;
|
||||||
uint16_t alpha = 0xc000;
|
uint16_t alpha = 0xc000;
|
||||||
asm volatile(YUVTORGB_SETUP
|
asm volatile(YUVTORGB_SETUP
|
||||||
"dup v22.8h, %w[limit] \n"
|
"dup v22.8h, %w[limit] \n"
|
||||||
"dup v23.8h, %w[alpha] \n"
|
"dup v23.8h, %w[alpha] \n"
|
||||||
"1: \n" //
|
"1: \n" //
|
||||||
READYUV410
|
READYUV410
|
||||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: [src_y] "+r"(src_y), // %[src_y]
|
: [src_y] "+r"(src_y), // %[src_y]
|
||||||
[src_u] "+r"(src_u), // %[src_u]
|
[src_u] "+r"(src_u), // %[src_u]
|
||||||
[src_v] "+r"(src_v), // %[src_v]
|
[src_v] "+r"(src_v), // %[src_v]
|
||||||
@ -348,12 +348,12 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
|
|||||||
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
|
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
|
||||||
const uint16_t limit = 0x3ff0;
|
const uint16_t limit = 0x3ff0;
|
||||||
asm volatile(YUVTORGB_SETUP
|
asm volatile(YUVTORGB_SETUP
|
||||||
"dup v22.8h, %w[limit] \n"
|
"dup v22.8h, %w[limit] \n"
|
||||||
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
||||||
"1: \n" //
|
"1: \n" //
|
||||||
READYUV212
|
READYUV212
|
||||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: [src_y] "+r"(src_y), // %[src_y]
|
: [src_y] "+r"(src_y), // %[src_y]
|
||||||
[src_u] "+r"(src_u), // %[src_u]
|
[src_u] "+r"(src_u), // %[src_u]
|
||||||
[src_v] "+r"(src_v), // %[src_v]
|
[src_v] "+r"(src_v), // %[src_v]
|
||||||
@ -530,13 +530,13 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
|
|||||||
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
|
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
|
||||||
const uint16_t limit = 0x3ff0;
|
const uint16_t limit = 0x3ff0;
|
||||||
asm volatile(YUVTORGB_SETUP
|
asm volatile(YUVTORGB_SETUP
|
||||||
"dup v22.8h, %w[limit] \n"
|
"dup v22.8h, %w[limit] \n"
|
||||||
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
||||||
"ldr q2, [%[kIndices]] \n"
|
"ldr q2, [%[kIndices]] \n"
|
||||||
"1: \n" //
|
"1: \n" //
|
||||||
READYUVP210
|
READYUVP210
|
||||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: [src_y] "+r"(src_y), // %[src_y]
|
: [src_y] "+r"(src_y), // %[src_y]
|
||||||
[src_uv] "+r"(src_uv), // %[src_uv]
|
[src_uv] "+r"(src_uv), // %[src_uv]
|
||||||
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
|
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
|
||||||
@ -557,13 +557,13 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
|
|||||||
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
|
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
|
||||||
uint16_t limit = 0x3ff0;
|
uint16_t limit = 0x3ff0;
|
||||||
asm volatile(YUVTORGB_SETUP
|
asm volatile(YUVTORGB_SETUP
|
||||||
"dup v22.8h, %w[limit] \n"
|
"dup v22.8h, %w[limit] \n"
|
||||||
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
||||||
"ldr q2, [%[kIndices]] \n"
|
"ldr q2, [%[kIndices]] \n"
|
||||||
"1: \n" //
|
"1: \n" //
|
||||||
READYUVP410
|
READYUVP410
|
||||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: [src_y] "+r"(src_y), // %[src_y]
|
: [src_y] "+r"(src_y), // %[src_y]
|
||||||
[src_uv] "+r"(src_uv), // %[src_uv]
|
[src_uv] "+r"(src_uv), // %[src_uv]
|
||||||
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
|
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
|
||||||
@ -3461,7 +3461,7 @@ static void ABCDToUVMatrixRow_NEON_I8MM(const uint8_t* src,
|
|||||||
// 16-bit)
|
// 16-bit)
|
||||||
"ld2r {v24.4s, v25.4s}, [%[uvconstants]] \n"
|
"ld2r {v24.4s, v25.4s}, [%[uvconstants]] \n"
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" // load 8 pixels
|
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" // load 8 pixels
|
||||||
"ld2 {v2.4s, v3.4s}, [%[src]], #32 \n" // load 8 pixels
|
"ld2 {v2.4s, v3.4s}, [%[src]], #32 \n" // load 8 pixels
|
||||||
"subs %w[width], %w[width], #16 \n" // 16 processed per loop
|
"subs %w[width], %w[width], #16 \n" // 16 processed per loop
|
||||||
|
|||||||
@ -1759,25 +1759,25 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
|
|||||||
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
||||||
uint16_t* dst_ptr,
|
uint16_t* dst_ptr,
|
||||||
int src_width) {
|
int src_width) {
|
||||||
asm volatile("pxor %%xmm5,%%xmm5 \n"
|
asm volatile("pxor %%xmm5,%%xmm5 \n"
|
||||||
|
|
||||||
// 16 pixel loop.
|
// 16 pixel loop.
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm3 \n"
|
"movdqu (%0),%%xmm3 \n"
|
||||||
"lea 0x10(%0),%0 \n" // src_ptr += 16
|
"lea 0x10(%0),%0 \n" // src_ptr += 16
|
||||||
"movdqu (%1),%%xmm0 \n"
|
"movdqu (%1),%%xmm0 \n"
|
||||||
"movdqu 0x10(%1),%%xmm1 \n"
|
"movdqu 0x10(%1),%%xmm1 \n"
|
||||||
"movdqa %%xmm3,%%xmm2 \n"
|
"movdqa %%xmm3,%%xmm2 \n"
|
||||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||||
"punpckhbw %%xmm5,%%xmm3 \n"
|
"punpckhbw %%xmm5,%%xmm3 \n"
|
||||||
"paddusw %%xmm2,%%xmm0 \n"
|
"paddusw %%xmm2,%%xmm0 \n"
|
||||||
"paddusw %%xmm3,%%xmm1 \n"
|
"paddusw %%xmm3,%%xmm1 \n"
|
||||||
"movdqu %%xmm0,(%1) \n"
|
"movdqu %%xmm0,(%1) \n"
|
||||||
"movdqu %%xmm1,0x10(%1) \n"
|
"movdqu %%xmm1,0x10(%1) \n"
|
||||||
"lea 0x20(%1),%1 \n"
|
"lea 0x20(%1),%1 \n"
|
||||||
"sub $0x10,%2 \n"
|
"sub $0x10,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(src_width) // %2
|
"+r"(src_width) // %2
|
||||||
@ -1790,23 +1790,23 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
|||||||
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
|
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
|
||||||
uint16_t* dst_ptr,
|
uint16_t* dst_ptr,
|
||||||
int src_width) {
|
int src_width) {
|
||||||
asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vmovdqu (%0),%%ymm3 \n"
|
"vmovdqu (%0),%%ymm3 \n"
|
||||||
"lea 0x20(%0),%0 \n" // src_ptr += 32
|
"lea 0x20(%0),%0 \n" // src_ptr += 32
|
||||||
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
|
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
|
||||||
"vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
|
"vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
|
||||||
"vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
|
"vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
|
||||||
"vpaddusw (%1),%%ymm2,%%ymm0 \n"
|
"vpaddusw (%1),%%ymm2,%%ymm0 \n"
|
||||||
"vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
|
"vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
|
||||||
"vmovdqu %%ymm0,(%1) \n"
|
"vmovdqu %%ymm0,(%1) \n"
|
||||||
"vmovdqu %%ymm1,0x20(%1) \n"
|
"vmovdqu %%ymm1,0x20(%1) \n"
|
||||||
"lea 0x40(%1),%1 \n"
|
"lea 0x40(%1),%1 \n"
|
||||||
"sub $0x20,%2 \n"
|
"sub $0x20,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
"vzeroupper \n"
|
"vzeroupper \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
"+r"(src_width) // %2
|
"+r"(src_width) // %2
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user