Mirror of https://chromium.googlesource.com/libyuv/libyuv
MergeUV AVX512BW use assembly
- Convert MergeUVRow_AVX512BW to assembly
- Enable MergeUVRow_AVX512BW for Windows with clang-cl
- MergeUVRow_AVX2: use vpmovzxbw and vpsllw
- MergeUVRow_16_AVX2: use vpmovzxbw and vpsllw with different shifts for U and V

AMD Zen 4, 640x360, 100000 iterations

Was:
  AVX512 MergeUVPlane_Opt (884 ms)
  AVX2 MergeUVPlane_Opt (945 ms)
  AVX2 MergeUVPlane_16_Opt (2167 ms)

Now:
  AVX512 MergeUVPlane_Opt (865 ms)
  AVX2 MergeUVPlane_Opt (943 ms)
  SSE2 MergeUVPlane_Opt (973 ms)
  AVX2 MergeUVPlane_16_Opt (2102 ms)

Bug: None
Change-Id: I658ada2a75d44c3f93be8bd3ed96f83d5fa2ab8d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4271230
Reviewed-by: Fritz Koenig <frkoenig@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
parent 2bdc210be9
commit 88b050f337
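
What the rewrite changes, in one picture: the old AVX2 loop loaded 32 U and 32 V bytes and interleaved them with vpunpcklbw/vpunpckhbw, whose within-128-bit-lane behavior forced four vextractf128 stores to restore memory order. The new loops instead zero-extend 16 bytes (AVX2) or 32 bytes (AVX512BW) into 16-bit lanes with vpmovzxbw, shift V into the high byte of each lane with vpsllw, and OR the two registers, so a single full-width store writes correctly ordered UV pairs. A minimal intrinsics sketch of the AVX2 idea (the committed code is the inline assembly in the diffs below; the function name and fixed 16-pixel width here are illustrative only):

#include <immintrin.h>
#include <stdint.h>

void MergeUV16_AVX2_Sketch(const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_uv) {
  // vpmovzxbw: zero-extend 16 bytes to 16 words.
  __m256i u = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u));
  __m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v));
  v = _mm256_slli_epi16(v, 8);  // vpsllw: V into the high byte of each word
  // vpor + vmovdqu: each word is now (V << 8) | U, i.e. bytes U,V in memory.
  _mm256_storeu_si256((__m256i*)dst_uv, _mm256_or_si256(u, v));
}

The AVX512BW loop is the same idea on 512-bit registers (vporq, vmovdqu64), 32 pixels per iteration, which is why its Any-wrapper mask stays 31 below while the AVX2 mask drops to 15.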
README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1860
+Version: 1861
 License: BSD
 License File: LICENSE
 
include/libyuv/row.h
@@ -402,9 +402,8 @@ extern "C" {
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
 // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
-// TODO(fbarchard): Port MERGEUV to assembly
 #if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512) && !defined(_MSC_VER))
+    (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #define HAS_MERGEUVROW_AVX512BW
 #endif
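
Dropping !defined(_MSC_VER) lets clang-cl builds on Windows (where both __clang__ and _MSC_VER are defined) pick up HAS_MERGEUVROW_AVX512BW, matching the commit message. Once defined, the macro feeds the usual libyuv dispatch shape; a hypothetical sketch of such a block (the flag name kCpuHasAVX512BW follows libyuv's cpu_id flags, but this exact block is assumed, not shown in this diff):

#if defined(HAS_MERGEUVROW_AVX512BW)
  if (TestCpuFlag(kCpuHasAVX512BW)) {
    MergeUVRow = MergeUVRow_Any_AVX512BW;
    if (IS_ALIGNED(halfwidth, 32)) {  // 32 pixels per AVX512BW iteration
      MergeUVRow = MergeUVRow_AVX512BW;
    }
  }
#endif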
include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1860
+#define LIBYUV_VERSION 1861
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
source/convert.cc
@@ -919,7 +919,7 @@ int I422ToNV21(const uint8_t* src_y,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow = MergeUVRow_AVX2;
     }
   }
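
The dispatch hunks in this and the next file all make the same one-character change: the width multiple needed to select the non-Any AVX2 row drops from 32 to 16, because the rewritten AVX2 loop now consumes 16 pixels per iteration (see row_gcc.cc below). For reference while reading them, this is the operation every MergeUVRow variant implements, as a scalar sketch (not the committed C fallback verbatim):

#include <stdint.h>

void MergeUVRow_Scalar(const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_uv,
                       int width) {
  // Interleave one U row and one V row into UV byte pairs
  // (the NV12/NV21 chroma layout all MergeUVRow variants produce).
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}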
source/convert_from_argb.cc
@@ -384,7 +384,7 @@ int ARGBToNV12(const uint8_t* src_argb,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -562,7 +562,7 @@ int ARGBToNV21(const uint8_t* src_argb,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -737,7 +737,7 @@ int ABGRToNV12(const uint8_t* src_abgr,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -913,7 +913,7 @@ int ABGRToNV21(const uint8_t* src_abgr,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -2948,7 +2948,7 @@ int RAWToJNV21(const uint8_t* src_raw,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
source/planar_functions.cc
@@ -162,7 +162,7 @@ void Convert8To16Plane(const uint8_t* src_y,
                        int src_stride_y,
                        uint16_t* dst_y,
                        int dst_stride_y,
-                       int scale,  // 16384 for 10 bits
+                       int scale,  // 1024 for 10 bits
                        int width,
                        int height) {
   int y;
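
The comment fix reflects that the 8-to-16 direction scales by 1 << depth: 1024 for 10 bits (16384 is the Convert16To8 scale for 10-bit input, the opposite direction). A worked sketch, assuming the byte-replicating scheme libyuv's C fallback uses elsewhere (scale * 0x0101, then >> 16); verify against row_common.cc:

#include <stdint.h>

uint16_t Convert8To16Pixel(uint8_t v, int scale) {
  // Replicating the byte before the shift maps full range to full range.
  return (uint16_t)(((uint32_t)v * (uint32_t)scale * 0x0101u) >> 16);
}
// Convert8To16Pixel(255, 1024) == 1023  (10-bit max)
// Convert8To16Pixel(255, 4096) == 4095  (12-bit max)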
@@ -594,7 +594,7 @@ void MergeUVPlane(const uint8_t* src_u,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
+    if (IS_ALIGNED(width, 16)) {
       MergeUVRow = MergeUVRow_AVX2;
     }
   }
@@ -736,7 +736,7 @@ void MergeUVPlane_16(const uint16_t* src_u,
 #if defined(HAS_MERGEUVROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       MergeUVRow_16 = MergeUVRow_16_AVX2;
     }
   }
source/row_any.cc
@@ -569,7 +569,7 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON,
 ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX2
-ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX512BW
 ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
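
In the ANY21 registrations the final argument is the SIMD step minus one, so 31 -> 15 records the AVX2 row's new 16-pixel step while the AVX512BW row keeps its 32-pixel step. In outline, the wrapper splits a row like this (a sketch of the pattern, not the ANY21 macro verbatim; SimdStep16 is a hypothetical stand-in for MergeUVRow_AVX2):

#include <stdint.h>

// Stand-in for the real SIMD row function; pretend it advances in
// 16-pixel steps (n is always a multiple of 16 when called).
static void SimdStep16(const uint8_t* u, const uint8_t* v,
                       uint8_t* uv, int n) {
  for (int x = 0; x < n; ++x) {
    uv[2 * x + 0] = u[x];
    uv[2 * x + 1] = v[x];
  }
}

void MergeUVRow_Any_Sketch(const uint8_t* src_u, const uint8_t* src_v,
                           uint8_t* dst_uv, int width) {
  const int kMask = 15;    // step (16) - 1; was 31 for the old 32-pixel loop
  int n = width & ~kMask;  // largest multiple of the step
  if (n > 0) {
    SimdStep16(src_u, src_v, dst_uv, n);
  }
  // Remainder (width & kMask) pixels: the real ANY21 macro copies them into
  // a padded temporary, runs one more SIMD step there, and copies back out.
  for (int x = n; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}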
@@ -861,7 +861,7 @@ ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
 }
 
 #ifdef HAS_MERGEUVROW_16_AVX2
-ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15)
+ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 7)
 #endif
 #ifdef HAS_MERGEUVROW_16_NEON
 ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7)
source/row_gcc.cc
@@ -17,8 +17,6 @@ extern "C" {
 // This module is for GCC x86 and x64.
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
 
-#include <immintrin.h>
-
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 // Constants for ARGB
@@ -5145,21 +5143,30 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
 #endif  // HAS_DETILESPLITUVROW_SSSE3
 
 #ifdef HAS_MERGEUVROW_AVX512BW
-__attribute__ ((target("avx512vl,avx512bw")))
 void MergeUVRow_AVX512BW(const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_uv,
                          int width) {
-  do {
-    const __m512i u = _mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_u));
-    const __m512i v = _mm512_slli_epi64(_mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_v)), 8);
-    const __m512i uv = _mm512_or_si512(u, v);
-    _mm512_storeu_epi8(dst_uv, uv);
-    src_u += 32;
-    src_v += 32;
-    dst_uv += 64;
-    width -= 32;
-  } while (width > 0);
+  asm volatile("sub %0,%1 \n"
+
+      LABELALIGN
+      "1: \n"
+      "vpmovzxbw (%0),%%zmm0 \n"
+      "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n"
+      "lea 0x20(%0),%0 \n"
+      "vpsllw $0x8,%%zmm1,%%zmm1 \n"
+      "vporq %%zmm0,%%zmm1,%%zmm2 \n"
+      "vmovdqu64 %%zmm2,(%2) \n"
+      "lea 0x40(%2),%2 \n"
+      "sub $0x20,%3 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_AVX512BW
 
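
The "sub %0,%1" at the top of each rewritten loop is a libyuv addressing idiom: %1 is turned into the byte offset from src_u to src_v, so "(%0)" and "0x00(%0,%1,1)" read U and V while the loop advances a single pointer. The same arithmetic in C (a sketch of the idiom only; note that subtracting pointers into different buffers is formally undefined in C++, while the register arithmetic it mirrors is fine):

#include <stddef.h>
#include <stdint.h>

void MergeUV_OffsetIdiom(const uint8_t* src_u, const uint8_t* src_v,
                         uint8_t* dst_uv, int width) {
  ptrdiff_t v_off = src_v - src_u;  // "sub %0,%1": %1 now holds src_v - src_u
  while (width > 0) {
    uint8_t u = src_u[0];      // "(%0)"
    uint8_t v = src_u[v_off];  // "0x00(%0,%1,1)"
    dst_uv[0] = u;
    dst_uv[1] = v;
    src_u += 1;   // "lea 0x20(%0),%0" advances only the U pointer (by 32)
    dst_uv += 2;  // "lea 0x40(%2),%2"
    width -= 1;   // "sub $0x20,%3" then "jg 1b"
  }
}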
@@ -5168,23 +5175,18 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile(
-
-      "sub %0,%1 \n"
+  asm volatile("sub %0,%1 \n"
 
       LABELALIGN
       "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
-      "lea 0x20(%0),%0 \n"
-      "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
-      "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
-      "vextractf128 $0x0,%%ymm2,(%2) \n"
-      "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
-      "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
-      "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
-      "lea 0x40(%2),%2 \n"
-      "sub $0x20,%3 \n"
+      "vpmovzxbw (%0),%%ymm0 \n"
+      "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n"
+      "lea 0x10(%0),%0 \n"
+      "vpsllw $0x8,%%ymm1,%%ymm1 \n"
+      "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+      "vmovdqu %%ymm2,(%2) \n"
+      "lea 0x20(%2),%2 \n"
+      "sub $0x10,%3 \n"
       "jg 1b \n"
       "vzeroupper \n"
      : "+r"(src_u),  // %0
@@ -5201,9 +5203,7 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile(
-
-      "sub %0,%1 \n"
+  asm volatile("sub %0,%1 \n"
 
       LABELALIGN
       "1: \n"
@@ -5233,37 +5233,35 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
                         uint16_t* dst_uv,
                         int depth,
                         int width) {
-  depth = 16 - depth;
   // clang-format off
   asm volatile (
     "vmovd %4,%%xmm3 \n"
-    "sub %0,%1 \n"
+    "vmovd %5,%%xmm4 \n"
 
-    // 16 pixels per loop.
+    "sub %0,%1 \n"
+    // 8 pixels per loop.
+
     LABELALIGN
     "1: \n"
-    "vmovdqu (%0),%%ymm0 \n"
-    "vmovdqu (%0,%1,1),%%ymm1 \n"
-    "add $0x20,%0 \n"
+    "vpmovzxwd (%0),%%ymm0 \n"
+    "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n"
+    "lea 0x10(%0),%0 \n"
 
     "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
-    "vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
-    "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n"  // mutates
-    "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
-    "vextractf128 $0x0,%%ymm2,(%2) \n"
-    "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
-    "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
-    "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
-    "add $0x40,%2 \n"
-    "sub $0x10,%3 \n"
+    "vpslld %%xmm4,%%ymm1,%%ymm1 \n"
+    "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+    "vmovdqu %%ymm2,(%2) \n"
+    "lea 0x20(%2),%2 \n"
+    "sub $0x8,%3 \n"
     "jg 1b \n"
     "vzeroupper \n"
     : "+r"(src_u),   // %0
       "+r"(src_v),   // %1
       "+r"(dst_uv),  // %2
       "+r"(width)    // %3
-    : "r"(depth)     // %4
-    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+    : "r"(16 - depth),  // %4
+      "r"(32 - depth)   // %5
+    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
   // clang-format on
 }
 #endif  // HAS_MERGEUVROW_AVX2
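
Why the 16-bit variant now needs two shift counts: vpmovzxwd widens each 16-bit sample into a 32-bit lane, so U becomes msb-aligned within the low word after a 16-bit shift by 16 - depth (vpsllw %%xmm3), while V must travel 32 - depth bits in a 32-bit shift (vpslld %%xmm4) to land msb-aligned in the high word. One vpor then forms the interleaved pair, which is why the "depth = 16 - depth;" statement disappears in favor of passing both counts as separate operands. The lane math, as a scalar sketch (not the committed code):

#include <stdint.h>

uint32_t MergeUVPair_16(uint16_t u, uint16_t v, int depth) {
  return ((uint32_t)u << (16 - depth))   // U: msb-aligned in the low word
       | ((uint32_t)v << (32 - depth));  // V: msb-aligned in the high word
}
// depth = 10: U shifts by 6, V by 22. u = 0x3FF -> 0x0000FFC0,
// v = 0x3FF -> 0xFFC00000; stored little-endian, the U word precedes V.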
@@ -5469,7 +5467,6 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
 // 512 = 9 bits
 // 1024 = 10 bits
 // 4096 = 12 bits
-// TODO(fbarchard): reduce to SSE2
 void Convert8To16Row_SSE2(const uint8_t* src_y,
                           uint16_t* dst_y,
                           int scale,
source/row_neon64.cc
@@ -820,28 +820,6 @@ void MergeUVRow_NEON(const uint8_t* src_u,
       : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON1(const uint8_t* src_u,
-                      const uint8_t* src_v,
-                      uint8_t* dst_uv,
-                      int width) {
-  asm volatile(
-      "1: \n"
-      "ld1 {v0.16b,v2.16b}, [%0], #32 \n"  // load U
-      "ld1 {v1.16b,v3.16b}, [%1], #32 \n"  // load V
-      "subs %w3, %w3, #32 \n"  // 32 processed per loop
-      "prfm pldl1keep, [%0, 448] \n"
-      "prfm pldl1keep, [%1, 448] \n"
-      "st2 {v0.16b,v1.16b,v2.16b,v3.16b}, [%2], #64 \n"  // store 32 UV
-      "b.gt 1b \n"
-      : "+r"(src_u),   // %0
-        "+r"(src_v),   // %1
-        "+r"(dst_uv),  // %2
-        "+r"(width)    // %3  // Output registers
-      :                       // Input registers
-      : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
 
 void MergeUVRow_16_NEON(const uint16_t* src_u,
                         const uint16_t* src_v,
unit_test/planar_test.cc
@@ -3534,8 +3534,8 @@ TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16)
 // TODO(fbarchard): improve test for platforms and cpu detect
 #ifdef HAS_MERGEUVROW_16_AVX2
 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
-  // Round count up to multiple of 16
-  const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+  // Round count up to multiple of 8
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
 
   align_buffer_page_end(src_pixels_u, kPixels * 2);
   align_buffer_page_end(src_pixels_v, kPixels * 2);
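
The test's buffer rounding tracks the new 8-pixel step of MergeUVRow_16_AVX2; "(n + 7) & ~7" is the standard power-of-two round-up idiom, shown standalone:

int RoundUpTo8(int n) {
  return (n + 7) & ~7;  // add (step - 1), then clear the low three bits
}
// RoundUpTo8(1) == 8, RoundUpTo8(8) == 8, RoundUpTo8(9) == 16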