mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
AVX512 VBMI version of ARGBToRGB24
Use VBMI instructions but on AVX2 registers to avoid clockrate change. Bug: libyuv:778 Test: LibYUVConvertTest.NV21ToRGB24_Opt Change-Id: Id4f8ad1e0e142a380c8a46c5eab90ce145a10edd Reviewed-on: https://chromium-review.googlesource.com/956609 Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
004954c969
commit
83aa7512c1
@ -43,7 +43,6 @@ cc_library {
|
|||||||
"source/scale_neon.cc",
|
"source/scale_neon.cc",
|
||||||
"source/scale_neon64.cc",
|
"source/scale_neon64.cc",
|
||||||
"source/video_common.cc",
|
"source/video_common.cc",
|
||||||
|
|
||||||
"source/convert_jpeg.cc",
|
"source/convert_jpeg.cc",
|
||||||
"source/mjpeg_decoder.cc",
|
"source/mjpeg_decoder.cc",
|
||||||
"source/mjpeg_validate.cc",
|
"source/mjpeg_validate.cc",
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1703
|
Version: 1704
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -277,6 +277,7 @@ extern "C" {
|
|||||||
#define HAS_ARGBTOAR30ROW_AVX2
|
#define HAS_ARGBTOAR30ROW_AVX2
|
||||||
#define HAS_ARGBTORAWROW_AVX2
|
#define HAS_ARGBTORAWROW_AVX2
|
||||||
#define HAS_ARGBTORGB24ROW_AVX2
|
#define HAS_ARGBTORGB24ROW_AVX2
|
||||||
|
#define HAS_ARGBTORGB24ROW_AVX512VBMI
|
||||||
#define HAS_CONVERT16TO8ROW_AVX2
|
#define HAS_CONVERT16TO8ROW_AVX2
|
||||||
#define HAS_CONVERT8TO16ROW_AVX2
|
#define HAS_CONVERT8TO16ROW_AVX2
|
||||||
#define HAS_I210TOAR30ROW_AVX2
|
#define HAS_I210TOAR30ROW_AVX2
|
||||||
@ -1706,6 +1707,8 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
|
|||||||
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
|
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
|
||||||
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
|
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
|
||||||
|
|
||||||
|
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
|
||||||
|
|
||||||
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
|
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
|
||||||
uint8_t* dst_rgb,
|
uint8_t* dst_rgb,
|
||||||
const uint32_t dither4,
|
const uint32_t dither4,
|
||||||
@ -2497,10 +2500,11 @@ void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
|
|||||||
void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
|
void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
|
||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int width);
|
int width);
|
||||||
void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr,
|
void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||||
|
void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
|
||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int width);
|
int width);
|
||||||
void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
|
void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr,
|
||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int width);
|
int width);
|
||||||
void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
|
void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1703
|
#define LIBYUV_VERSION 1704
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -887,6 +887,14 @@ int ARGBToRGB24(const uint8_t* src_argb,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
|
||||||
|
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
|
||||||
|
ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI;
|
||||||
|
if (IS_ALIGNED(width, 32)) {
|
||||||
|
ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_ARGBTORGB24ROW_NEON)
|
#if defined(HAS_ARGBTORGB24ROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
|
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
|
||||||
|
|||||||
@ -447,6 +447,9 @@ ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
|
|||||||
#if defined(HAS_ARGBTORGB24ROW_AVX2)
|
#if defined(HAS_ARGBTORGB24ROW_AVX2)
|
||||||
ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
|
ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
|
||||||
|
ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
|
||||||
|
#endif
|
||||||
#if defined(HAS_ARGBTORAWROW_AVX2)
|
#if defined(HAS_ARGBTORAWROW_AVX2)
|
||||||
ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
|
ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -552,6 +552,54 @@ void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
|||||||
"xmm7");
|
"xmm7");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO(fbarchard): Detect whether the compiler can do AVX512 and add ifdefs
|
||||||
|
|
||||||
|
// Shuffle table for converting ARGBToRGB24
|
||||||
|
static const ulvec8 kPermARGBToRGB24_0 = {
|
||||||
|
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
|
||||||
|
14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
|
||||||
|
29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
|
||||||
|
static const ulvec8 kPermARGBToRGB24_1 = {
|
||||||
|
10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
|
||||||
|
25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
|
||||||
|
40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
|
||||||
|
static const ulvec8 kPermARGBToRGB24_2 = {
|
||||||
|
21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
|
||||||
|
36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
|
||||||
|
50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
|
||||||
|
|
||||||
|
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
|
||||||
|
asm volatile(
|
||||||
|
"vmovdqa %3,%%ymm5 \n"
|
||||||
|
"vmovdqa %4,%%ymm6 \n"
|
||||||
|
"vmovdqa %5,%%ymm7 \n"
|
||||||
|
|
||||||
|
LABELALIGN
|
||||||
|
"1: \n"
|
||||||
|
"vmovdqu (%0),%%ymm0 \n"
|
||||||
|
"vmovdqu 0x20(%0),%%ymm1 \n"
|
||||||
|
"vmovdqu 0x40(%0),%%ymm2 \n"
|
||||||
|
"vmovdqu 0x60(%0),%%ymm3 \n"
|
||||||
|
"lea 0x80(%0),%0 \n"
|
||||||
|
"vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
|
||||||
|
"vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
|
||||||
|
"vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
|
||||||
|
"vmovdqu %%ymm0,(%1) \n"
|
||||||
|
"vmovdqu %%ymm1,0x20(%1) \n"
|
||||||
|
"vmovdqu %%ymm2,0x40(%1) \n"
|
||||||
|
"lea 0x60(%1),%1 \n"
|
||||||
|
"sub $0x20,%2 \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
"vzeroupper \n"
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
: "m"(kPermARGBToRGB24_0), // %3
|
||||||
|
"m"(kPermARGBToRGB24_1), // %4
|
||||||
|
"m"(kPermARGBToRGB24_2) // %5
|
||||||
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
|
||||||
|
}
|
||||||
|
|
||||||
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vbroadcastf128 %3,%%ymm6 \n"
|
"vbroadcastf128 %3,%%ymm6 \n"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user