mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-06-15 00:16:08 +08:00
Mirrow AVX2 functions for Visual C
Bug: libyuv:42280902 Change-Id: Iabbec9af3a4f4dd89294e60145823c7fc4dd6ec6 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7843378 Commit-Queue: Frank Barchard <fbarchard@google.com> Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
0f320a03f7
commit
cda55fcf53
@ -10,7 +10,7 @@ Libyuv uses a dispatch system where high-level conversion functions call optimiz
|
|||||||
|
|
||||||
### x86 Architectures (32-bit and 64-bit)
|
### x86 Architectures (32-bit and 64-bit)
|
||||||
|
|
||||||
* **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for GCC and Clang. Supports SSE, AVX, and AVX512. AVX512 implementations are strictly for 64-bit targets.
|
* **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for GCC and Clang. Supports AVX, and AVX512. AVX512 implementations are strictly for 64-bit targets.
|
||||||
* **row_win.cc**: Derivative of `row_gcc.cc`. Contains C++ intrinsics specifically for Visual C++ (MSVC). Can be tested with Clang using `-DLIBYUV_ENABLE_ROWWIN`.
|
* **row_win.cc**: Derivative of `row_gcc.cc`. Contains C++ intrinsics specifically for Visual C++ (MSVC). Can be tested with Clang using `-DLIBYUV_ENABLE_ROWWIN`.
|
||||||
* **Note**: Use either `row_gcc` or `row_win`, never both.
|
* **Note**: Use either `row_gcc` or `row_win`, never both.
|
||||||
|
|
||||||
@ -33,10 +33,8 @@ Libyuv uses a dispatch system where high-level conversion functions call optimiz
|
|||||||
|
|
||||||
## Coding Guidelines
|
## Coding Guidelines
|
||||||
|
|
||||||
1. **Maintain Parity**: When modifying `row_common.cc`, update corresponding optimizations. `row_gcc.cc` is the master; `row_win.cc` should be updated to match.
|
1. **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86 only**.
|
||||||
2. **Assembly Safety**: Ensure inline assembly in `row_gcc.cc` correctly preserves registers according to the ABI.
|
2. **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to enable or disable specific AVX512 versions.
|
||||||
3. **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86 only**.
|
|
||||||
4. **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to enable or disable specific AVX512 versions.
|
|
||||||
|
|
||||||
## Changelist (CL) & Commit Guidelines
|
## Changelist (CL) & Commit Guidelines
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||||
Version: 1938
|
Version: 1939
|
||||||
Revision: DEPS
|
Revision: DEPS
|
||||||
License: BSD-3-Clause
|
License: BSD-3-Clause
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|||||||
@ -72,7 +72,6 @@ Additional commonly used compiler options can be passed to Bazel via `--copt`:
|
|||||||
|
|
||||||
bazel build -c opt --config=android_arm64 \
|
bazel build -c opt --config=android_arm64 \
|
||||||
--copt=-DLIBYUV_UNLIMITED_DATA \
|
--copt=-DLIBYUV_UNLIMITED_DATA \
|
||||||
--copt=-DLIBYUV_BIT_EXACT=1 \
|
|
||||||
--copt=-DENABLE_ROW_TESTS \
|
--copt=-DENABLE_ROW_TESTS \
|
||||||
//:libyuv_test
|
//:libyuv_test
|
||||||
|
|
||||||
|
|||||||
@ -142,8 +142,13 @@ extern "C" {
|
|||||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||||
(defined(__x86_64__) || defined(__i386__) || \
|
(defined(__x86_64__) || defined(__i386__) || \
|
||||||
defined(_M_X64) || defined(_M_X86))
|
defined(_M_X64) || defined(_M_X86))
|
||||||
|
#define HAS_ARGBMIRRORROW_AVX2
|
||||||
|
#define HAS_RGB24MIRRORROW_AVX2
|
||||||
#define HAS_ARGBTOUVMATRIXROW_AVX2
|
#define HAS_ARGBTOUVMATRIXROW_AVX2
|
||||||
#define HAS_MERGEUVROW_AVX2
|
#define HAS_MERGEUVROW_AVX2
|
||||||
|
#define HAS_MIRRORROW_AVX2
|
||||||
|
#define HAS_MIRRORSPLITUVROW_AVX2
|
||||||
|
#define HAS_MIRRORUVROW_AVX2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
|
#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
|
||||||
@ -170,6 +175,7 @@ extern "C" {
|
|||||||
#define HAS_INTERPOLATEROW_AVX2
|
#define HAS_INTERPOLATEROW_AVX2
|
||||||
#define HAS_J422TOARGBROW_AVX2
|
#define HAS_J422TOARGBROW_AVX2
|
||||||
#define HAS_MIRRORROW_AVX2
|
#define HAS_MIRRORROW_AVX2
|
||||||
|
#define HAS_MIRRORSPLITUVROW_AVX2
|
||||||
#define HAS_NV12TOARGBROW_AVX2
|
#define HAS_NV12TOARGBROW_AVX2
|
||||||
#define HAS_NV12TORGB24ROW_AVX2
|
#define HAS_NV12TORGB24ROW_AVX2
|
||||||
#define HAS_NV12TORGB565ROW_AVX2
|
#define HAS_NV12TORGB565ROW_AVX2
|
||||||
@ -236,7 +242,6 @@ extern "C" {
|
|||||||
#define HAS_P410TOARGBROW_SSSE3
|
#define HAS_P410TOARGBROW_SSSE3
|
||||||
#define HAS_RAWTOARGBROW_AVX2
|
#define HAS_RAWTOARGBROW_AVX2
|
||||||
#define HAS_RAWTORGBAROW_SSSE3
|
#define HAS_RAWTORGBAROW_SSSE3
|
||||||
#define HAS_RGB24MIRRORROW_SSSE3
|
|
||||||
#define HAS_RGBATOYJROW_SSSE3
|
#define HAS_RGBATOYJROW_SSSE3
|
||||||
#define HAS_SPLITARGBROW_SSE2
|
#define HAS_SPLITARGBROW_SSE2
|
||||||
#define HAS_SPLITARGBROW_SSSE3
|
#define HAS_SPLITARGBROW_SSSE3
|
||||||
@ -2939,15 +2944,14 @@ void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
|
|||||||
void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
|
void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
|
||||||
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
|
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
|
||||||
void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||||
void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
|
||||||
void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||||
void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||||
void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||||
|
|
||||||
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
|
void MirrorSplitUVRow_AVX2(const uint8_t* src,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width);
|
int width);
|
||||||
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
|
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
@ -2983,16 +2987,16 @@ void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int width);
|
int width);
|
||||||
|
|
||||||
void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
|
void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
|
||||||
uint8_t* dst_rgb24,
|
uint8_t* dst_rgb24,
|
||||||
int width);
|
int width);
|
||||||
void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
|
void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
|
||||||
uint8_t* dst_rgb24,
|
uint8_t* dst_rgb24,
|
||||||
int width);
|
int width);
|
||||||
void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width);
|
void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width);
|
||||||
void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
|
void RGB24MirrorRow_Any_AVX2(const uint8_t* src_ptr,
|
||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int width);
|
int width);
|
||||||
void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
|
void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
|
||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int width);
|
int width);
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1938
|
#define LIBYUV_VERSION 1939
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -2583,7 +2583,6 @@ void MirrorUVPlane(const uint8_t* src_uv,
|
|||||||
#endif
|
#endif
|
||||||
#if defined(HAS_MIRRORUVROW_SSSE3)
|
#if defined(HAS_MIRRORUVROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||||
MirrorUVRow = MirrorUVRow_Any_SSSE3;
|
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 8)) {
|
||||||
MirrorUVRow = MirrorUVRow_SSSE3;
|
MirrorUVRow = MirrorUVRow_SSSE3;
|
||||||
}
|
}
|
||||||
@ -2823,11 +2822,11 @@ int RGB24Mirror(const uint8_t* src_rgb24,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_RGB24MIRRORROW_SSSE3)
|
#if defined(HAS_RGB24MIRRORROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||||
RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3;
|
RGB24MirrorRow = RGB24MirrorRow_Any_AVX2;
|
||||||
if (IS_ALIGNED(width, 16)) {
|
if (IS_ALIGNED(width, 32)) {
|
||||||
RGB24MirrorRow = RGB24MirrorRow_SSSE3;
|
RGB24MirrorRow = RGB24MirrorRow_AVX2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -397,9 +397,9 @@ void SplitRotateUV180(const uint8_t* src,
|
|||||||
MirrorSplitUVRow = MirrorSplitUVRow_NEON;
|
MirrorSplitUVRow = MirrorSplitUVRow_NEON;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
|
#if defined(HAS_MIRRORSPLITUVROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
|
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
|
||||||
MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
|
MirrorSplitUVRow = MirrorSplitUVRow_AVX2;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_MIRRORSPLITUVROW_LSX)
|
#if defined(HAS_MIRRORSPLITUVROW_LSX)
|
||||||
|
|||||||
@ -1906,8 +1906,8 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
|
|||||||
// Any 1 to 1 mirror.
|
// Any 1 to 1 mirror.
|
||||||
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
|
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
|
||||||
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
|
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
|
||||||
SIMD_ALIGNED(uint8_t vin[64]); \
|
SIMD_ALIGNED(uint8_t vin[128]); \
|
||||||
SIMD_ALIGNED(uint8_t vout[64]); \
|
SIMD_ALIGNED(uint8_t vout[128]); \
|
||||||
memset(vin, 0, sizeof(vin)); /* for msan */ \
|
memset(vin, 0, sizeof(vin)); /* for msan */ \
|
||||||
int r = width & MASK; \
|
int r = width & MASK; \
|
||||||
int n = width & ~MASK; \
|
int n = width & ~MASK; \
|
||||||
@ -1915,7 +1915,7 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
|
|||||||
ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
|
ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
|
||||||
} \
|
} \
|
||||||
ptrdiff_t np = n; \
|
ptrdiff_t np = n; \
|
||||||
memcpy(vin, src_ptr, r* BPP); \
|
memcpy(vin, src_ptr, r * BPP); \
|
||||||
ANY_SIMD(vin, vout, MASK + 1); \
|
ANY_SIMD(vin, vout, MASK + 1); \
|
||||||
memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
|
memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
|
||||||
}
|
}
|
||||||
@ -1938,9 +1938,6 @@ ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63)
|
|||||||
#ifdef HAS_MIRRORUVROW_AVX2
|
#ifdef HAS_MIRRORUVROW_AVX2
|
||||||
ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
|
ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_MIRRORUVROW_SSSE3
|
|
||||||
ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
|
|
||||||
#endif
|
|
||||||
#ifdef HAS_MIRRORUVROW_NEON
|
#ifdef HAS_MIRRORUVROW_NEON
|
||||||
ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
|
ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
|
||||||
#endif
|
#endif
|
||||||
@ -1965,8 +1962,8 @@ ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7)
|
|||||||
#ifdef HAS_ARGBMIRRORROW_LASX
|
#ifdef HAS_ARGBMIRRORROW_LASX
|
||||||
ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15)
|
ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB24MIRRORROW_SSSE3
|
#ifdef HAS_RGB24MIRRORROW_AVX2
|
||||||
ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
|
ANY11M(RGB24MirrorRow_Any_AVX2, RGB24MirrorRow_AVX2, 3, 31)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB24MIRRORROW_NEON
|
#ifdef HAS_RGB24MIRRORROW_NEON
|
||||||
ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
|
ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
|
||||||
|
|||||||
@ -37,10 +37,6 @@ extern "C" {
|
|||||||
// LIBYUV_UNLIMITED_BT709
|
// LIBYUV_UNLIMITED_BT709
|
||||||
// LIBYUV_UNLIMITED_BT2020
|
// LIBYUV_UNLIMITED_BT2020
|
||||||
|
|
||||||
#if defined(LIBYUV_BIT_EXACT)
|
|
||||||
#define LIBYUV_UNATTENUATE_DUP 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// llvm x86 is poor at ternary operator, so use branchless min/max.
|
// llvm x86 is poor at ternary operator, so use branchless min/max.
|
||||||
|
|
||||||
#define USE_BRANCHLESS 1
|
#define USE_BRANCHLESS 1
|
||||||
@ -3578,12 +3574,8 @@ const uint32_t fixed_invtbl8[256] = {
|
|||||||
T(0xfc), T(0xfd), T(0xfe), 0x01000100};
|
T(0xfc), T(0xfd), T(0xfe), 0x01000100};
|
||||||
#undef T
|
#undef T
|
||||||
|
|
||||||
#if defined(LIBYUV_UNATTENUATE_DUP)
|
|
||||||
// This code mimics the Intel SIMD version for better testability.
|
// This code mimics the Intel SIMD version for better testability.
|
||||||
#define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16)
|
#define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16)
|
||||||
#else
|
|
||||||
#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// mimics the Intel SIMD code for exactness.
|
// mimics the Intel SIMD code for exactness.
|
||||||
void ARGBUnattenuateRow_C(const uint8_t* src_argb,
|
void ARGBUnattenuateRow_C(const uint8_t* src_argb,
|
||||||
|
|||||||
@ -21,6 +21,10 @@ extern "C" {
|
|||||||
(defined(__x86_64__) || defined(__i386__)) && \
|
(defined(__x86_64__) || defined(__i386__)) && \
|
||||||
!defined(LIBYUV_ENABLE_ROWWIN)
|
!defined(LIBYUV_ENABLE_ROWWIN)
|
||||||
|
|
||||||
|
// Note: for avx and avx512 declare clobber as xmm registers due to
|
||||||
|
// clang for windows needing to preserve xmm registers but not saving
|
||||||
|
// them if declared as ymm or zmm.
|
||||||
|
|
||||||
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
|
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
|
||||||
|
|
||||||
// Constants for ARGB
|
// Constants for ARGB
|
||||||
@ -268,11 +272,11 @@ static const uint32_t kPermdRAWToARGB_AVX512BW[16] = {
|
|||||||
|
|
||||||
void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) {
|
void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vpternlogd $0xff,%%zmm22,%%zmm22,%%zmm22 \n" // 0xffffffff
|
"vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n" // 0xffffffff
|
||||||
"vpslld $0x18,%%zmm22,%%zmm22 \n" // 0xff000000
|
"vpslld $0x18,%%zmm6,%%zmm6 \n" // 0xff000000
|
||||||
"movabs $0xffffffffffff,%%rax \n" // 48 bytes mask
|
"movabs $0xffffffffffff,%%rax \n" // 48 bytes mask
|
||||||
"kmovq %%rax,%%k1 \n"
|
"kmovq %%rax,%%k1 \n"
|
||||||
"vmovdqu32 %3,%%zmm21 \n"
|
"vmovdqu32 %3,%%zmm5 \n"
|
||||||
"vbroadcasti32x4 %4,%%zmm4 \n"
|
"vbroadcasti32x4 %4,%%zmm4 \n"
|
||||||
|
|
||||||
LABELALIGN //
|
LABELALIGN //
|
||||||
@ -282,18 +286,18 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint
|
|||||||
"vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n"
|
"vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n"
|
||||||
"vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n"
|
"vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n"
|
||||||
"lea 192(%0),%0 \n"
|
"lea 192(%0),%0 \n"
|
||||||
"vpermd %%zmm0,%%zmm21,%%zmm0 \n"
|
"vpermd %%zmm0,%%zmm5,%%zmm0 \n"
|
||||||
"vpermd %%zmm1,%%zmm21,%%zmm1 \n"
|
"vpermd %%zmm1,%%zmm5,%%zmm1 \n"
|
||||||
"vpermd %%zmm2,%%zmm21,%%zmm2 \n"
|
"vpermd %%zmm2,%%zmm5,%%zmm2 \n"
|
||||||
"vpermd %%zmm3,%%zmm21,%%zmm3 \n"
|
"vpermd %%zmm3,%%zmm5,%%zmm3 \n"
|
||||||
"vpshufb %%zmm4,%%zmm0,%%zmm0 \n"
|
"vpshufb %%zmm4,%%zmm0,%%zmm0 \n"
|
||||||
"vpshufb %%zmm4,%%zmm1,%%zmm1 \n"
|
"vpshufb %%zmm4,%%zmm1,%%zmm1 \n"
|
||||||
"vpshufb %%zmm4,%%zmm2,%%zmm2 \n"
|
"vpshufb %%zmm4,%%zmm2,%%zmm2 \n"
|
||||||
"vpshufb %%zmm4,%%zmm3,%%zmm3 \n"
|
"vpshufb %%zmm4,%%zmm3,%%zmm3 \n"
|
||||||
"vpord %%zmm22,%%zmm0,%%zmm0 \n"
|
"vpord %%zmm6,%%zmm0,%%zmm0 \n"
|
||||||
"vpord %%zmm22,%%zmm1,%%zmm1 \n"
|
"vpord %%zmm6,%%zmm1,%%zmm1 \n"
|
||||||
"vpord %%zmm22,%%zmm2,%%zmm2 \n"
|
"vpord %%zmm6,%%zmm2,%%zmm2 \n"
|
||||||
"vpord %%zmm22,%%zmm3,%%zmm3 \n"
|
"vpord %%zmm6,%%zmm3,%%zmm3 \n"
|
||||||
"vmovdqu32 %%zmm0,(%1) \n"
|
"vmovdqu32 %%zmm0,(%1) \n"
|
||||||
"vmovdqu32 %%zmm1,0x40(%1) \n"
|
"vmovdqu32 %%zmm1,0x40(%1) \n"
|
||||||
"vmovdqu32 %%zmm2,0x80(%1) \n"
|
"vmovdqu32 %%zmm2,0x80(%1) \n"
|
||||||
@ -307,7 +311,7 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint
|
|||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
: "m"(kPermdRAWToARGB_AVX512BW), // %3
|
: "m"(kPermdRAWToARGB_AVX512BW), // %3
|
||||||
"m"(*shuffler) // %4
|
"m"(*shuffler) // %4
|
||||||
: "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm21", "xmm22");
|
: "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||||
}
|
}
|
||||||
|
|
||||||
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
||||||
@ -1568,26 +1572,26 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
const struct ArgbConstants* c) {
|
const struct ArgbConstants* c) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
|
"vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
|
||||||
"vpsllw $15,%%zmm16,%%zmm21 \n"
|
"vpsllw $15,%%zmm16,%%zmm5 \n"
|
||||||
"vpacksswb %%zmm21,%%zmm21,%%zmm21 \n"
|
"vpacksswb %%zmm5,%%zmm5,%%zmm5 \n"
|
||||||
"vpsrlw $15,%%zmm16,%%zmm16 \n" // zmm16 = 1
|
"vpsrlw $15,%%zmm16,%%zmm16 \n" // zmm16 = 1
|
||||||
"vbroadcasti64x4 0(%3),%%zmm4 \n"
|
"vbroadcasti64x4 0(%3),%%zmm4 \n"
|
||||||
"vbroadcasti64x4 0x60(%3),%%zmm23 \n"
|
"vbroadcasti64x4 0x60(%3),%%zmm7 \n"
|
||||||
"vpmaddubsw %%zmm21,%%zmm4,%%zmm22 \n"
|
"vpmaddubsw %%zmm5,%%zmm4,%%zmm6 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm22,%%zmm22 \n"
|
"vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n"
|
||||||
"vpackssdw %%zmm22,%%zmm22,%%zmm22 \n"
|
"vpackssdw %%zmm6,%%zmm6,%%zmm6 \n"
|
||||||
"vpsubw %%zmm22,%%zmm23,%%zmm23 \n"
|
"vpsubw %%zmm6,%%zmm7,%%zmm7 \n"
|
||||||
"vmovups %4,%%zmm22 \n"
|
"vmovups %4,%%zmm6 \n"
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vmovups (%0),%%zmm0 \n"
|
"vmovups (%0),%%zmm0 \n"
|
||||||
"vmovups 0x40(%0),%%zmm1 \n"
|
"vmovups 0x40(%0),%%zmm1 \n"
|
||||||
"vmovups 0x80(%0),%%zmm2 \n"
|
"vmovups 0x80(%0),%%zmm2 \n"
|
||||||
"vmovups 0xc0(%0),%%zmm3 \n"
|
"vmovups 0xc0(%0),%%zmm3 \n"
|
||||||
"vpsubb %%zmm21,%%zmm0,%%zmm0 \n"
|
"vpsubb %%zmm5,%%zmm0,%%zmm0 \n"
|
||||||
"vpsubb %%zmm21,%%zmm1,%%zmm1 \n"
|
"vpsubb %%zmm5,%%zmm1,%%zmm1 \n"
|
||||||
"vpsubb %%zmm21,%%zmm2,%%zmm2 \n"
|
"vpsubb %%zmm5,%%zmm2,%%zmm2 \n"
|
||||||
"vpsubb %%zmm21,%%zmm3,%%zmm3 \n"
|
"vpsubb %%zmm5,%%zmm3,%%zmm3 \n"
|
||||||
"vpmaddubsw %%zmm0,%%zmm4,%%zmm0 \n"
|
"vpmaddubsw %%zmm0,%%zmm4,%%zmm0 \n"
|
||||||
"vpmaddubsw %%zmm1,%%zmm4,%%zmm1 \n"
|
"vpmaddubsw %%zmm1,%%zmm4,%%zmm1 \n"
|
||||||
"vpmaddubsw %%zmm2,%%zmm4,%%zmm2 \n"
|
"vpmaddubsw %%zmm2,%%zmm4,%%zmm2 \n"
|
||||||
@ -1599,12 +1603,12 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
|
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm3,%%zmm3 \n"
|
"vpmaddwd %%zmm16,%%zmm3,%%zmm3 \n"
|
||||||
"vpackssdw %%zmm3,%%zmm2,%%zmm2 \n"
|
"vpackssdw %%zmm3,%%zmm2,%%zmm2 \n"
|
||||||
"vpaddw %%zmm23,%%zmm0,%%zmm0 \n"
|
"vpaddw %%zmm7,%%zmm0,%%zmm0 \n"
|
||||||
"vpaddw %%zmm23,%%zmm2,%%zmm2 \n"
|
"vpaddw %%zmm7,%%zmm2,%%zmm2 \n"
|
||||||
"vpsrlw $0x8,%%zmm0,%%zmm0 \n"
|
"vpsrlw $0x8,%%zmm0,%%zmm0 \n"
|
||||||
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
|
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
|
||||||
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n"
|
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n"
|
||||||
"vpermd %%zmm0,%%zmm22,%%zmm0 \n"
|
"vpermd %%zmm0,%%zmm6,%%zmm0 \n"
|
||||||
"vmovups %%zmm0,(%1) \n"
|
"vmovups %%zmm0,(%1) \n"
|
||||||
"lea 0x40(%1),%1 \n"
|
"lea 0x40(%1),%1 \n"
|
||||||
"sub $0x40,%2 \n"
|
"sub $0x40,%2 \n"
|
||||||
@ -1615,8 +1619,8 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
: "r"(c), // %3
|
: "r"(c), // %3
|
||||||
"m"(kPermdARGBToY_AVX512BW) // %4
|
"m"(kPermdARGBToY_AVX512BW) // %4
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm16", "xmm21",
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||||
"xmm22", "xmm23");
|
"xmm7", "xmm16");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -1773,8 +1777,8 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
"vbroadcasti64x4 0x20(%4),%%zmm3 \n" // kRGBToU
|
"vbroadcasti64x4 0x20(%4),%%zmm3 \n" // kRGBToU
|
||||||
"vbroadcasti64x4 0x40(%4),%%zmm4 \n" // kRGBToV
|
"vbroadcasti64x4 0x40(%4),%%zmm4 \n" // kRGBToV
|
||||||
"vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" // -1
|
"vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" // -1
|
||||||
"vpsllw $15,%%zmm16,%%zmm21 \n" // 0x8000
|
"vpsllw $15,%%zmm16,%%zmm5 \n" // 0x8000
|
||||||
"vmovups %5,%%zmm23 \n"
|
"vmovups %5,%%zmm7 \n"
|
||||||
"sub %1,%2 \n"
|
"sub %1,%2 \n"
|
||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
@ -1782,45 +1786,45 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
"vmovups (%0),%%zmm0 \n"
|
"vmovups (%0),%%zmm0 \n"
|
||||||
"vmovups 0x40(%0),%%zmm1 \n"
|
"vmovups 0x40(%0),%%zmm1 \n"
|
||||||
"vmovups 0x80(%0),%%zmm2 \n"
|
"vmovups 0x80(%0),%%zmm2 \n"
|
||||||
"vmovups 0xc0(%0),%%zmm22 \n"
|
"vmovups 0xc0(%0),%%zmm6 \n"
|
||||||
"vpmaddubsw %%zmm3,%%zmm0,%%zmm0 \n"
|
"vpmaddubsw %%zmm3,%%zmm0,%%zmm0 \n"
|
||||||
"vpmaddubsw %%zmm3,%%zmm1,%%zmm1 \n"
|
"vpmaddubsw %%zmm3,%%zmm1,%%zmm1 \n"
|
||||||
"vpmaddubsw %%zmm3,%%zmm2,%%zmm2 \n"
|
"vpmaddubsw %%zmm3,%%zmm2,%%zmm2 \n"
|
||||||
"vpmaddubsw %%zmm3,%%zmm22,%%zmm22 \n"
|
"vpmaddubsw %%zmm3,%%zmm6,%%zmm6 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n"
|
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
|
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
|
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm22,%%zmm22 \n"
|
"vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n"
|
||||||
"vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // mutates
|
"vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // mutates
|
||||||
"vpackssdw %%zmm22,%%zmm2,%%zmm2 \n"
|
"vpackssdw %%zmm6,%%zmm2,%%zmm2 \n"
|
||||||
"vpsubw %%zmm21,%%zmm0,%%zmm0 \n"
|
"vpsubw %%zmm5,%%zmm0,%%zmm0 \n"
|
||||||
"vpsubw %%zmm21,%%zmm2,%%zmm2 \n"
|
"vpsubw %%zmm5,%%zmm2,%%zmm2 \n"
|
||||||
"vpsrlw $0x8,%%zmm0,%%zmm0 \n"
|
"vpsrlw $0x8,%%zmm0,%%zmm0 \n"
|
||||||
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
|
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
|
||||||
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // mutates
|
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // mutates
|
||||||
"vpermd %%zmm0,%%zmm23,%%zmm0 \n" // unmutate.
|
"vpermd %%zmm0,%%zmm7,%%zmm0 \n" // unmutate.
|
||||||
"vmovups %%zmm0,(%1) \n"
|
"vmovups %%zmm0,(%1) \n"
|
||||||
|
|
||||||
"vmovups (%0),%%zmm0 \n"
|
"vmovups (%0),%%zmm0 \n"
|
||||||
"vmovups 0x40(%0),%%zmm1 \n"
|
"vmovups 0x40(%0),%%zmm1 \n"
|
||||||
"vmovups 0x80(%0),%%zmm2 \n"
|
"vmovups 0x80(%0),%%zmm2 \n"
|
||||||
"vmovups 0xc0(%0),%%zmm22 \n"
|
"vmovups 0xc0(%0),%%zmm6 \n"
|
||||||
"vpmaddubsw %%zmm4,%%zmm0,%%zmm0 \n"
|
"vpmaddubsw %%zmm4,%%zmm0,%%zmm0 \n"
|
||||||
"vpmaddubsw %%zmm4,%%zmm1,%%zmm1 \n"
|
"vpmaddubsw %%zmm4,%%zmm1,%%zmm1 \n"
|
||||||
"vpmaddubsw %%zmm4,%%zmm2,%%zmm2 \n"
|
"vpmaddubsw %%zmm4,%%zmm2,%%zmm2 \n"
|
||||||
"vpmaddubsw %%zmm4,%%zmm22,%%zmm22 \n"
|
"vpmaddubsw %%zmm4,%%zmm6,%%zmm6 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n"
|
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
|
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
|
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm22,%%zmm22 \n"
|
"vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n"
|
||||||
"vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // mutates
|
"vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // mutates
|
||||||
"vpackssdw %%zmm22,%%zmm2,%%zmm2 \n"
|
"vpackssdw %%zmm6,%%zmm2,%%zmm2 \n"
|
||||||
"vpsubw %%zmm21,%%zmm0,%%zmm0 \n"
|
"vpsubw %%zmm5,%%zmm0,%%zmm0 \n"
|
||||||
"vpsubw %%zmm21,%%zmm2,%%zmm2 \n"
|
"vpsubw %%zmm5,%%zmm2,%%zmm2 \n"
|
||||||
"vpsrlw $0x8,%%zmm0,%%zmm0 \n"
|
"vpsrlw $0x8,%%zmm0,%%zmm0 \n"
|
||||||
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
|
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
|
||||||
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // mutates
|
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // mutates
|
||||||
"vpermd %%zmm0,%%zmm23,%%zmm0 \n" // unmutate.
|
"vpermd %%zmm0,%%zmm7,%%zmm0 \n" // unmutate.
|
||||||
"vmovups %%zmm0,(%1,%2,1) \n"
|
"vmovups %%zmm0,(%1,%2,1) \n"
|
||||||
"lea 0x100(%0),%0 \n"
|
"lea 0x100(%0),%0 \n"
|
||||||
"lea 0x40(%1),%1 \n"
|
"lea 0x40(%1),%1 \n"
|
||||||
@ -1837,8 +1841,8 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
#endif
|
#endif
|
||||||
: "r"(c), // %4
|
: "r"(c), // %4
|
||||||
"m"(kPermdARGBToY_AVX512BW) // %5
|
"m"(kPermdARGBToY_AVX512BW) // %5
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm16", "xmm21",
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||||
"xmm22", "xmm23");
|
"xmm7", "xmm16");
|
||||||
}
|
}
|
||||||
#endif // HAS_ARGBTOUV444ROW_AVX512BW
|
#endif // HAS_ARGBTOUV444ROW_AVX512BW
|
||||||
|
|
||||||
@ -2233,11 +2237,11 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
const struct ArgbConstants* c) {
|
const struct ArgbConstants* c) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vbroadcasti64x4 0x20(%5),%%zmm4 \n" // RGBToU
|
"vbroadcasti64x4 0x20(%5),%%zmm4 \n" // RGBToU
|
||||||
"vbroadcasti64x4 0x40(%5),%%zmm21 \n" // RGBToV
|
"vbroadcasti64x4 0x40(%5),%%zmm5 \n" // RGBToV
|
||||||
"vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
|
"vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
|
||||||
"vpabsb %%zmm16,%%zmm22 \n" // 0x0101
|
"vpabsb %%zmm16,%%zmm6 \n" // 0x0101
|
||||||
"vpsllw $15,%%zmm16,%%zmm17 \n" // 0x8000
|
"vpsllw $15,%%zmm16,%%zmm17 \n" // 0x8000
|
||||||
"vbroadcasti64x4 %6,%%zmm23 \n" // kShuffleAARRGGBB
|
"vbroadcasti64x4 %6,%%zmm7 \n" // kShuffleAARRGGBB
|
||||||
"vmovups %7,%%zmm18 \n" // kPermdARGBToY_AVX512BW
|
"vmovups %7,%%zmm18 \n" // kPermdARGBToY_AVX512BW
|
||||||
"vmovups %8,%%zmm19 \n" // kPermdARGBToUV_AVX512BW
|
"vmovups %8,%%zmm19 \n" // kPermdARGBToUV_AVX512BW
|
||||||
"sub %1,%2 \n"
|
"sub %1,%2 \n"
|
||||||
@ -2248,14 +2252,14 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
"vmovups 0x40(%0),%%zmm1 \n"
|
"vmovups 0x40(%0),%%zmm1 \n"
|
||||||
"vmovups 0x00(%0,%4,1),%%zmm2 \n"
|
"vmovups 0x00(%0,%4,1),%%zmm2 \n"
|
||||||
"vmovups 0x40(%0,%4,1),%%zmm3 \n"
|
"vmovups 0x40(%0,%4,1),%%zmm3 \n"
|
||||||
"vpshufb %%zmm23,%%zmm0,%%zmm0 \n" // aarrggbb
|
"vpshufb %%zmm7,%%zmm0,%%zmm0 \n" // aarrggbb
|
||||||
"vpshufb %%zmm23,%%zmm1,%%zmm1 \n"
|
"vpshufb %%zmm7,%%zmm1,%%zmm1 \n"
|
||||||
"vpshufb %%zmm23,%%zmm2,%%zmm2 \n"
|
"vpshufb %%zmm7,%%zmm2,%%zmm2 \n"
|
||||||
"vpshufb %%zmm23,%%zmm3,%%zmm3 \n"
|
"vpshufb %%zmm7,%%zmm3,%%zmm3 \n"
|
||||||
"vpmaddubsw %%zmm22,%%zmm0,%%zmm0 \n" // 32x2 -> 16x2
|
"vpmaddubsw %%zmm6,%%zmm0,%%zmm0 \n" // 32x2 -> 16x2
|
||||||
"vpmaddubsw %%zmm22,%%zmm1,%%zmm1 \n"
|
"vpmaddubsw %%zmm6,%%zmm1,%%zmm1 \n"
|
||||||
"vpmaddubsw %%zmm22,%%zmm2,%%zmm2 \n"
|
"vpmaddubsw %%zmm6,%%zmm2,%%zmm2 \n"
|
||||||
"vpmaddubsw %%zmm22,%%zmm3,%%zmm3 \n"
|
"vpmaddubsw %%zmm6,%%zmm3,%%zmm3 \n"
|
||||||
"vpaddw %%zmm0,%%zmm2,%%zmm0 \n" // 16x2 -> 16x1
|
"vpaddw %%zmm0,%%zmm2,%%zmm0 \n" // 16x2 -> 16x1
|
||||||
"vpaddw %%zmm1,%%zmm3,%%zmm1 \n"
|
"vpaddw %%zmm1,%%zmm3,%%zmm1 \n"
|
||||||
"vpxorq %%zmm2,%%zmm2,%%zmm2 \n" // 0 for vpavgw
|
"vpxorq %%zmm2,%%zmm2,%%zmm2 \n" // 0 for vpavgw
|
||||||
@ -2267,7 +2271,7 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
"vpermd %%zmm0,%%zmm19,%%zmm0 \n" // unscramble pixels
|
"vpermd %%zmm0,%%zmm19,%%zmm0 \n" // unscramble pixels
|
||||||
|
|
||||||
"vpmaddubsw %%zmm4,%%zmm0,%%zmm1 \n" // 16 U
|
"vpmaddubsw %%zmm4,%%zmm0,%%zmm1 \n" // 16 U
|
||||||
"vpmaddubsw %%zmm21,%%zmm0,%%zmm0 \n" // 16 V
|
"vpmaddubsw %%zmm5,%%zmm0,%%zmm0 \n" // 16 V
|
||||||
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
|
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
|
||||||
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n"
|
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n"
|
||||||
"vpackssdw %%zmm0,%%zmm1,%%zmm0 \n" // mutates (U in lower, V in upper)
|
"vpackssdw %%zmm0,%%zmm1,%%zmm0 \n" // mutates (U in lower, V in upper)
|
||||||
@ -2298,8 +2302,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
|
|||||||
"m"(kShuffleAARRGGBB), // %6
|
"m"(kShuffleAARRGGBB), // %6
|
||||||
"m"(kPermdARGBToY_AVX512BW), // %7
|
"m"(kPermdARGBToY_AVX512BW), // %7
|
||||||
"m"(kPermdARGBToUV_AVX512BW) // %8
|
"m"(kPermdARGBToUV_AVX512BW) // %8
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm16", "xmm17",
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||||
"xmm18", "xmm19", "xmm21", "xmm22", "xmm23");
|
"xmm7", "xmm16", "xmm17", "xmm18", "xmm19");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ARGBToUVRow_AVX512BW(const uint8_t* src_argb,
|
void ARGBToUVRow_AVX512BW(const uint8_t* src_argb,
|
||||||
@ -3626,14 +3630,14 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
|||||||
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
|
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
|
||||||
|
|
||||||
#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
|
#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
|
||||||
"vpternlogd $0xff,%%zmm29,%%zmm29,%%zmm29 \n" \
|
"vpternlogd $0xff,%%zmm13,%%zmm13,%%zmm13 \n" \
|
||||||
"vpbroadcastq (%[yuvconstants]),%%zmm24 \n" \
|
"vpbroadcastq (%[yuvconstants]),%%zmm8 \n" \
|
||||||
"vpabsb %%zmm29,%%zmm29 \n" \
|
"vpabsb %%zmm13,%%zmm13 \n" \
|
||||||
"vpsllw $7,%%zmm29,%%zmm29 \n" \
|
"vpsllw $7,%%zmm13,%%zmm13 \n" \
|
||||||
"vpbroadcastq 32(%[yuvconstants]),%%zmm25 \n" \
|
"vpbroadcastq 32(%[yuvconstants]),%%zmm9 \n" \
|
||||||
"vpbroadcastq 64(%[yuvconstants]),%%zmm26 \n" \
|
"vpbroadcastq 64(%[yuvconstants]),%%zmm10 \n" \
|
||||||
"vpbroadcastq 96(%[yuvconstants]),%%zmm27 \n" \
|
"vpbroadcastq 96(%[yuvconstants]),%%zmm11 \n" \
|
||||||
"vpbroadcastq 128(%[yuvconstants]),%%zmm28 \n" \
|
"vpbroadcastq 128(%[yuvconstants]),%%zmm12 \n" \
|
||||||
"vmovups (%[quadsplitperm]),%%zmm16 \n" \
|
"vmovups (%[quadsplitperm]),%%zmm16 \n" \
|
||||||
"vmovups (%[dquadsplitperm]),%%zmm17 \n" \
|
"vmovups (%[dquadsplitperm]),%%zmm17 \n" \
|
||||||
"vmovups (%[unperm]),%%zmm18 \n"
|
"vmovups (%[unperm]),%%zmm18 \n"
|
||||||
@ -3650,12 +3654,12 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
|||||||
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
|
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
|
||||||
|
|
||||||
#define YUVTORGB16_AVX512BW(yuvconstants) \
|
#define YUVTORGB16_AVX512BW(yuvconstants) \
|
||||||
"vpsubb %%zmm29,%%zmm3,%%zmm3 \n" \
|
"vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \
|
||||||
"vpmulhuw %%zmm27,%%zmm4,%%zmm4 \n" \
|
"vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \
|
||||||
"vpmaddubsw %%zmm3,%%zmm24,%%zmm0 \n" \
|
"vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \
|
||||||
"vpmaddubsw %%zmm3,%%zmm25,%%zmm1 \n" \
|
"vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \
|
||||||
"vpmaddubsw %%zmm3,%%zmm26,%%zmm2 \n" \
|
"vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \
|
||||||
"vpaddw %%zmm4,%%zmm28,%%zmm4 \n" \
|
"vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \
|
||||||
"vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \
|
"vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \
|
||||||
"vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \
|
"vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \
|
||||||
"vpaddsw %%zmm4,%%zmm2,%%zmm2 \n"
|
"vpaddsw %%zmm4,%%zmm2,%%zmm2 \n"
|
||||||
@ -3722,7 +3726,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
|||||||
#define STOREARGB_AVX512BW \
|
#define STOREARGB_AVX512BW \
|
||||||
"vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \
|
"vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \
|
||||||
"vpermq %%zmm0,%%zmm18,%%zmm0 \n" \
|
"vpermq %%zmm0,%%zmm18,%%zmm0 \n" \
|
||||||
"vpunpcklbw %%zmm21,%%zmm2,%%zmm2 \n" \
|
"vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \
|
||||||
"vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
|
"vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
|
||||||
"vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
|
"vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
|
||||||
"vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
|
"vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
|
||||||
@ -3844,7 +3848,7 @@ void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
|
|||||||
YUVTORGB_SETUP_AVX512BW(yuvconstants)
|
YUVTORGB_SETUP_AVX512BW(yuvconstants)
|
||||||
"sub %[u_buf],%[v_buf] \n"
|
"sub %[u_buf],%[v_buf] \n"
|
||||||
"vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"
|
"vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"
|
||||||
"vpbroadcastq %%xmm5,%%zmm21 \n"
|
"vpbroadcastq %%xmm5,%%zmm5 \n"
|
||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -4685,6 +4689,43 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
|||||||
}
|
}
|
||||||
#endif // HAS_MIRRORROW_AVX2
|
#endif // HAS_MIRRORROW_AVX2
|
||||||
|
|
||||||
|
#ifdef HAS_MIRRORSPLITUVROW_AVX2
|
||||||
|
// Shuffle table for reversing the bytes of UV channels.
|
||||||
|
static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
|
||||||
|
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
|
||||||
|
|
||||||
|
void MirrorSplitUVRow_AVX2(const uint8_t* src,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
ptrdiff_t temp_width = (ptrdiff_t)(width);
|
||||||
|
asm volatile(
|
||||||
|
"vbroadcasti128 %4,%%ymm1 \n"
|
||||||
|
"lea -0x20(%0,%3,2),%0 \n"
|
||||||
|
"sub %1,%2 \n"
|
||||||
|
|
||||||
|
LABELALIGN
|
||||||
|
"1: \n"
|
||||||
|
"vmovdqu (%0),%%ymm0 \n"
|
||||||
|
"lea -0x20(%0),%0 \n"
|
||||||
|
"vpshufb %%ymm1,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpermq $0x72,%%ymm0,%%ymm0 \n"
|
||||||
|
"vextracti128 $0x1,%%ymm0,%%xmm2 \n"
|
||||||
|
"vmovdqu %%xmm0,(%1) \n"
|
||||||
|
"vmovdqu %%xmm2,0x00(%1,%2,1) \n"
|
||||||
|
"lea 0x10(%1),%1 \n"
|
||||||
|
"sub $0x10,%3 \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
"vzeroupper \n"
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst_u), // %1
|
||||||
|
"+r"(dst_v), // %2
|
||||||
|
"+r"(temp_width) // %3
|
||||||
|
: "m"(kShuffleMirrorSplitUV) // %4
|
||||||
|
: "memory", "cc", "xmm0", "xmm1", "xmm2");
|
||||||
|
}
|
||||||
|
#endif // HAS_MIRRORSPLITUVROW_AVX2
|
||||||
|
|
||||||
#ifdef HAS_MIRRORUVROW_SSSE3
|
#ifdef HAS_MIRRORUVROW_SSSE3
|
||||||
// Shuffle table for reversing the UV.
|
// Shuffle table for reversing the UV.
|
||||||
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
|
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
|
||||||
@ -4733,38 +4774,7 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
|
|||||||
}
|
}
|
||||||
#endif // HAS_MIRRORUVROW_AVX2
|
#endif // HAS_MIRRORUVROW_AVX2
|
||||||
|
|
||||||
#ifdef HAS_MIRRORSPLITUVROW_SSSE3
|
|
||||||
// Shuffle table for reversing the bytes of UV channels.
|
|
||||||
static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
|
|
||||||
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
|
|
||||||
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
|
|
||||||
uint8_t* dst_u,
|
|
||||||
uint8_t* dst_v,
|
|
||||||
int width) {
|
|
||||||
ptrdiff_t temp_width = (ptrdiff_t)(width);
|
|
||||||
asm volatile(
|
|
||||||
"movdqa %4,%%xmm1 \n"
|
|
||||||
"lea -0x10(%0,%3,2),%0 \n"
|
|
||||||
"sub %1,%2 \n"
|
|
||||||
|
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
|
||||||
"movdqu (%0),%%xmm0 \n"
|
|
||||||
"lea -0x10(%0),%0 \n"
|
|
||||||
"pshufb %%xmm1,%%xmm0 \n"
|
|
||||||
"movlpd %%xmm0,(%1) \n"
|
|
||||||
"movhpd %%xmm0,0x00(%1,%2,1) \n"
|
|
||||||
"lea 0x8(%1),%1 \n"
|
|
||||||
"sub $8,%3 \n"
|
|
||||||
"jg 1b \n"
|
|
||||||
: "+r"(src), // %0
|
|
||||||
"+r"(dst_u), // %1
|
|
||||||
"+r"(dst_v), // %2
|
|
||||||
"+r"(temp_width) // %3
|
|
||||||
: "m"(kShuffleMirrorSplitUV) // %4
|
|
||||||
: "memory", "cc", "xmm0", "xmm1");
|
|
||||||
}
|
|
||||||
#endif // HAS_MIRRORSPLITUVROW_SSSE3
|
|
||||||
|
|
||||||
#ifdef HAS_RGB24MIRRORROW_SSSE3
|
#ifdef HAS_RGB24MIRRORROW_SSSE3
|
||||||
|
|
||||||
@ -4813,6 +4823,60 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
|
|||||||
}
|
}
|
||||||
#endif // HAS_RGB24MIRRORROW_SSSE3
|
#endif // HAS_RGB24MIRRORROW_SSSE3
|
||||||
|
|
||||||
|
#ifdef HAS_RGB24MIRRORROW_AVX2
|
||||||
|
// Shuffle first 10 pixels to last 10 mirrored. first byte zero
|
||||||
|
static const uvec8 kShuffleMirrorRGB0_AVX = {
|
||||||
|
128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u
|
||||||
|
};
|
||||||
|
|
||||||
|
// Shuffle last 2 pixels to first 2 mirrored. last byte zero
|
||||||
|
static const uvec8 kShuffleMirrorRGB1_AVX = {
|
||||||
|
13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u
|
||||||
|
};
|
||||||
|
|
||||||
|
void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
|
||||||
|
uint8_t* dst_rgb24,
|
||||||
|
int width) {
|
||||||
|
ptrdiff_t temp_width = (ptrdiff_t)(width);
|
||||||
|
src_rgb24 += width * 3 - 96;
|
||||||
|
asm volatile(
|
||||||
|
"vbroadcasti128 %3,%%ymm4 \n"
|
||||||
|
"vmovdqa %4,%%xmm5 \n"
|
||||||
|
|
||||||
|
LABELALIGN
|
||||||
|
"1: \n"
|
||||||
|
"vmovdqu (%0),%%xmm0 \n" // first 10
|
||||||
|
"vinserti128 $1,15(%0),%%ymm0,%%ymm0 \n"
|
||||||
|
"vmovdqu 30(%0),%%xmm1 \n" // next 10
|
||||||
|
"vinserti128 $1,45(%0),%%ymm1,%%ymm1 \n"
|
||||||
|
"vmovdqu 60(%0),%%xmm2 \n" // next 10
|
||||||
|
"vinserti128 $1,75(%0),%%ymm2,%%ymm2 \n"
|
||||||
|
"vmovdqu 80(%0),%%xmm3 \n" // last 2 special
|
||||||
|
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
|
||||||
|
"vpshufb %%ymm4,%%ymm2,%%ymm2 \n"
|
||||||
|
"vpshufb %%xmm5,%%xmm3,%%xmm3 \n"
|
||||||
|
"lea -0x60(%0),%0 \n"
|
||||||
|
"vmovdqu %%xmm0,80(%1) \n"
|
||||||
|
"vextracti128 $1,%%ymm0,65(%1) \n"
|
||||||
|
"vmovdqu %%xmm1,50(%1) \n"
|
||||||
|
"vextracti128 $1,%%ymm1,35(%1) \n"
|
||||||
|
"vmovdqu %%xmm2,20(%1) \n"
|
||||||
|
"vextracti128 $1,%%ymm2,5(%1) \n"
|
||||||
|
"vmovq %%xmm3,0(%1) \n"
|
||||||
|
"lea 0x60(%1),%1 \n"
|
||||||
|
"sub $0x20,%2 \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
"vzeroupper \n"
|
||||||
|
: "+r"(src_rgb24), // %0
|
||||||
|
"+r"(dst_rgb24), // %1
|
||||||
|
"+r"(temp_width) // %2
|
||||||
|
: "m"(kShuffleMirrorRGB0_AVX), // %3
|
||||||
|
"m"(kShuffleMirrorRGB1_AVX) // %4
|
||||||
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
||||||
|
}
|
||||||
|
#endif // HAS_RGB24MIRRORROW_AVX2
|
||||||
|
|
||||||
#ifdef HAS_ARGBMIRRORROW_SSE2
|
#ifdef HAS_ARGBMIRRORROW_SSE2
|
||||||
|
|
||||||
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
|
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
|
||||||
|
|||||||
@ -470,6 +470,128 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAS_MIRRORROW_AVX2
|
||||||
|
LIBYUV_TARGET_AVX2
|
||||||
|
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||||
|
__m256i ymm_shuf =
|
||||||
|
_mm256_broadcastsi128_si256(_mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
|
||||||
|
src += width;
|
||||||
|
while (width > 0) {
|
||||||
|
src -= 32;
|
||||||
|
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src);
|
||||||
|
ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
|
||||||
|
ymm0 = _mm256_permute4x64_epi64(ymm0, 0x4e);
|
||||||
|
_mm256_storeu_si256((__m256i*)dst, ymm0);
|
||||||
|
dst += 32;
|
||||||
|
width -= 32;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAS_MIRRORUVROW_AVX2
|
||||||
|
LIBYUV_TARGET_AVX2
|
||||||
|
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
|
||||||
|
__m256i ymm_shuf =
|
||||||
|
_mm256_broadcastsi128_si256(_mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
|
||||||
|
src_uv += width * 2;
|
||||||
|
while (width > 0) {
|
||||||
|
src_uv -= 32;
|
||||||
|
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_uv);
|
||||||
|
ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
|
||||||
|
ymm0 = _mm256_permute4x64_epi64(ymm0, 0x4e);
|
||||||
|
_mm256_storeu_si256((__m256i*)dst_uv, ymm0);
|
||||||
|
dst_uv += 32;
|
||||||
|
width -= 16;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAS_MIRRORSPLITUVROW_AVX2
|
||||||
|
LIBYUV_TARGET_AVX2
|
||||||
|
void MirrorSplitUVRow_AVX2(const uint8_t* src_uv,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
__m256i ymm_shuf =
|
||||||
|
_mm256_broadcastsi128_si256(_mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1));
|
||||||
|
src_uv += width * 2;
|
||||||
|
while (width > 0) {
|
||||||
|
src_uv -= 32;
|
||||||
|
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_uv);
|
||||||
|
ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
|
||||||
|
ymm0 = _mm256_permute4x64_epi64(ymm0, 0x72);
|
||||||
|
_mm_storeu_si128((__m128i*)dst_u, _mm256_castsi256_si128(ymm0));
|
||||||
|
_mm_storeu_si128((__m128i*)dst_v, _mm256_extracti128_si256(ymm0, 1));
|
||||||
|
dst_u += 16;
|
||||||
|
dst_v += 16;
|
||||||
|
width -= 16;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAS_RGB24MIRRORROW_AVX2
|
||||||
|
LIBYUV_TARGET_AVX2
|
||||||
|
void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
|
||||||
|
uint8_t* dst_rgb24,
|
||||||
|
int width) {
|
||||||
|
__m256i shuf0 = _mm256_setr_epi8(
|
||||||
|
-1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2,
|
||||||
|
-1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2);
|
||||||
|
__m128i shuf1 = _mm_setr_epi8(
|
||||||
|
13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1);
|
||||||
|
|
||||||
|
src_rgb24 += width * 3 - 96;
|
||||||
|
while (width > 0) {
|
||||||
|
__m128i v0_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 0));
|
||||||
|
__m128i v0_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 15));
|
||||||
|
__m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1);
|
||||||
|
|
||||||
|
__m128i v1_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 30));
|
||||||
|
__m128i v1_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 45));
|
||||||
|
__m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1);
|
||||||
|
|
||||||
|
__m128i v2_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 60));
|
||||||
|
__m128i v2_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 75));
|
||||||
|
__m256i v2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1);
|
||||||
|
|
||||||
|
__m128i v3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 80));
|
||||||
|
|
||||||
|
v0 = _mm256_shuffle_epi8(v0, shuf0);
|
||||||
|
v1 = _mm256_shuffle_epi8(v1, shuf0);
|
||||||
|
v2 = _mm256_shuffle_epi8(v2, shuf0);
|
||||||
|
v3 = _mm_shuffle_epi8(v3, shuf1);
|
||||||
|
|
||||||
|
_mm_storeu_si128((__m128i*)(dst_rgb24 + 80), _mm256_castsi256_si128(v0));
|
||||||
|
_mm_storeu_si128((__m128i*)(dst_rgb24 + 65), _mm256_extracti128_si256(v0, 1));
|
||||||
|
_mm_storeu_si128((__m128i*)(dst_rgb24 + 50), _mm256_castsi256_si128(v1));
|
||||||
|
_mm_storeu_si128((__m128i*)(dst_rgb24 + 35), _mm256_extracti128_si256(v1, 1));
|
||||||
|
_mm_storeu_si128((__m128i*)(dst_rgb24 + 20), _mm256_castsi256_si128(v2));
|
||||||
|
_mm_storeu_si128((__m128i*)(dst_rgb24 + 5), _mm256_extracti128_si256(v2, 1));
|
||||||
|
_mm_storel_epi64((__m128i*)(dst_rgb24 + 0), v3);
|
||||||
|
|
||||||
|
src_rgb24 -= 96;
|
||||||
|
dst_rgb24 += 96;
|
||||||
|
width -= 32;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBMIRRORROW_AVX2
|
||||||
|
LIBYUV_TARGET_AVX2
|
||||||
|
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||||
|
__m256i ymm_shuf = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||||
|
src += width * 4;
|
||||||
|
while (width > 0) {
|
||||||
|
src -= 32;
|
||||||
|
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src);
|
||||||
|
ymm0 = _mm256_permutevar8x32_epi32(ymm0, ymm_shuf);
|
||||||
|
_mm256_storeu_si256((__m256i*)dst, ymm0);
|
||||||
|
dst += 32;
|
||||||
|
width -= 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
@ -22,14 +22,8 @@ namespace libyuv {
|
|||||||
|
|
||||||
// TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB.
|
// TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB.
|
||||||
// Port to Visual C and other CPUs
|
// Port to Visual C and other CPUs
|
||||||
#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
|
|
||||||
(defined(__x86_64__) || defined(__i386__))
|
|
||||||
#define ERROR_FULL 5
|
|
||||||
#define ERROR_J420 4
|
|
||||||
#else
|
|
||||||
#define ERROR_FULL 6
|
#define ERROR_FULL 6
|
||||||
#define ERROR_J420 6
|
#define ERROR_J420 6
|
||||||
#endif
|
|
||||||
#define ERROR_R 1
|
#define ERROR_R 1
|
||||||
#define ERROR_G 1
|
#define ERROR_G 1
|
||||||
#ifdef LIBYUV_UNLIMITED_DATA
|
#ifdef LIBYUV_UNLIMITED_DATA
|
||||||
|
|||||||
@ -2050,7 +2050,6 @@ TEST_F(LibYUVConvertTest, MM21ToYUY2) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Test RGB24 to J420 is exact
|
// Test RGB24 to J420 is exact
|
||||||
#if defined(LIBYUV_BIT_EXACT)
|
|
||||||
TEST_F(LibYUVConvertTest, TestRGB24ToJ420) {
|
TEST_F(LibYUVConvertTest, TestRGB24ToJ420) {
|
||||||
const int kSize = 256;
|
const int kSize = 256;
|
||||||
align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24
|
align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24
|
||||||
@ -2075,10 +2074,8 @@ TEST_F(LibYUVConvertTest, TestRGB24ToJ420) {
|
|||||||
free_aligned_buffer_page_end(orig_rgb24);
|
free_aligned_buffer_page_end(orig_rgb24);
|
||||||
free_aligned_buffer_page_end(dest_j420);
|
free_aligned_buffer_page_end(dest_j420);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
// Test RGB24 to I420 is exact
|
// Test RGB24 to I420 is exact
|
||||||
#if defined(LIBYUV_BIT_EXACT)
|
|
||||||
TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
|
TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
|
||||||
const int kSize = 256;
|
const int kSize = 256;
|
||||||
align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24
|
align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24
|
||||||
@ -2103,7 +2100,6 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
|
|||||||
free_aligned_buffer_page_end(orig_rgb24);
|
free_aligned_buffer_page_end(orig_rgb24);
|
||||||
free_aligned_buffer_page_end(dest_i420);
|
free_aligned_buffer_page_end(dest_i420);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
TEST_F(LibYUVConvertTest, TestJ420ToI420) {
|
TEST_F(LibYUVConvertTest, TestJ420ToI420) {
|
||||||
const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255,
|
const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255,
|
||||||
|
|||||||
@ -29,11 +29,7 @@
|
|||||||
#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
|
#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(LIBYUV_BIT_EXACT)
|
|
||||||
#define EXPECTED_UNATTENUATE_DIFF 0
|
#define EXPECTED_UNATTENUATE_DIFF 0
|
||||||
#else
|
|
||||||
#define EXPECTED_UNATTENUATE_DIFF 2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
|
|
||||||
@ -284,28 +280,28 @@ TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
|
|||||||
int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_,
|
int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_,
|
||||||
benchmark_iterations_, disable_cpu_flags_,
|
benchmark_iterations_, disable_cpu_flags_,
|
||||||
benchmark_cpu_info_, +1, 0);
|
benchmark_cpu_info_, +1, 0);
|
||||||
ASSERT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
|
ASSERT_EQ(max_diff, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
|
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
|
||||||
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
||||||
benchmark_iterations_, disable_cpu_flags_,
|
benchmark_iterations_, disable_cpu_flags_,
|
||||||
benchmark_cpu_info_, +1, 1);
|
benchmark_cpu_info_, +1, 1);
|
||||||
ASSERT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
|
ASSERT_EQ(max_diff, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
|
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
|
||||||
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
||||||
benchmark_iterations_, disable_cpu_flags_,
|
benchmark_iterations_, disable_cpu_flags_,
|
||||||
benchmark_cpu_info_, -1, 0);
|
benchmark_cpu_info_, -1, 0);
|
||||||
ASSERT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
|
ASSERT_EQ(max_diff, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
|
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
|
||||||
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
|
||||||
benchmark_iterations_, disable_cpu_flags_,
|
benchmark_iterations_, disable_cpu_flags_,
|
||||||
benchmark_cpu_info_, +1, 0);
|
benchmark_cpu_info_, +1, 0);
|
||||||
ASSERT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
|
ASSERT_EQ(max_diff, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
|
TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user