mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-06-15 00:16:08 +08:00
YUV to RGB avoid avx assist
The following functions were flagged for mixing SSE and AVX (or AVX-512) instructions, which can trigger an AVX transition/assist performance penalty. Libyuv functions addressed in this CL: * I422ToARGBRow_AVX512BW * HalfFloatRow_SSE2. Not addressed: * ScaleFilterCols_SSSE3. Bug: libyuv:509681367 Change-Id: I8ced6065dfe0c516d05857086393782c8590062a Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7814945 Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
5a17753597
commit
561a9780e2
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1934
|
||||
Version: 1935
|
||||
Revision: DEPS
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
|
||||
@ -50,7 +50,6 @@ extern "C" {
|
||||
#define HAS_COPYROW_ERMS
|
||||
#define HAS_COPYROW_SSE2
|
||||
#define HAS_H422TOARGBROW_SSSE3
|
||||
#define HAS_HALFFLOATROW_SSE2
|
||||
#define HAS_I422TOARGB1555ROW_SSSE3
|
||||
#define HAS_I422TOARGB4444ROW_SSSE3
|
||||
#define HAS_I422TOARGBROW_SSSE3
|
||||
@ -6863,14 +6862,6 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
|
||||
|
||||
// Scale and convert to half float.
|
||||
void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width);
|
||||
void HalfFloatRow_SSE2(const uint16_t* src,
|
||||
uint16_t* dst,
|
||||
float scale,
|
||||
int width);
|
||||
void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
float param,
|
||||
int width);
|
||||
void HalfFloatRow_AVX2(const uint16_t* src,
|
||||
uint16_t* dst,
|
||||
float scale,
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1934
|
||||
#define LIBYUV_VERSION 1935
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -5077,14 +5077,6 @@ int HalfFloatPlane(const uint16_t* src_y,
|
||||
height = 1;
|
||||
src_stride_y = dst_stride_y = 0;
|
||||
}
|
||||
#if defined(HAS_HALFFLOATROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
HalfFloatRow = HalfFloatRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HALFFLOATROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_AVX2;
|
||||
|
||||
@ -1748,9 +1748,6 @@ ANY11SB(Convert8To8Row_Any_AVX2,
|
||||
memcpy(dst_ptr + np, vout, r * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_SSE2
|
||||
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
|
||||
#endif
|
||||
|
||||
@ -224,8 +224,8 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0xff000000
|
||||
"vpslld $0x18,%%ymm6,%%ymm6 \n"
|
||||
"vbroadcastf128 %3,%%ymm4 \n" //
|
||||
"vbroadcastf128 %4,%%ymm5 \n" //
|
||||
"vbroadcasti128 %3,%%ymm4 \n" //
|
||||
"vbroadcasti128 %4,%%ymm5 \n" //
|
||||
|
||||
LABELALIGN //
|
||||
"1: \n"
|
||||
@ -609,7 +609,7 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
|
||||
|
||||
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm6 \n"
|
||||
"vbroadcasti128 %3,%%ymm6 \n"
|
||||
"vmovdqa %4,%%ymm7 \n"
|
||||
|
||||
LABELALIGN
|
||||
@ -703,7 +703,7 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
|
||||
#ifdef HAS_ARGBTORAWROW_AVX2
|
||||
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm6 \n"
|
||||
"vbroadcasti128 %3,%%ymm6 \n"
|
||||
"vmovdqa %4,%%ymm7 \n"
|
||||
|
||||
LABELALIGN
|
||||
@ -1061,7 +1061,7 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
|
||||
#ifdef HAS_ARGBTOAR30ROW_AVX2
|
||||
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
|
||||
"vbroadcasti128 %3,%%ymm2 \n" // shuffler for RB
|
||||
"vbroadcastss %4,%%ymm3 \n" // multiplier for RB
|
||||
"vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
|
||||
"vbroadcastss %6,%%ymm5 \n" // mask for AG
|
||||
@ -1098,7 +1098,7 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
#ifdef HAS_ABGRTOAR30ROW_AVX2
|
||||
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
|
||||
"vbroadcasti128 %3,%%ymm2 \n" // shuffler for RB
|
||||
"vbroadcastss %4,%%ymm3 \n" // multiplier for RB
|
||||
"vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
|
||||
"vbroadcastss %6,%%ymm5 \n" // mask for AG
|
||||
@ -1264,8 +1264,8 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
|
||||
uint16_t* dst_ab64,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm2 \n"
|
||||
"vbroadcastf128 %4,%%ymm3 \n" LABELALIGN
|
||||
"vbroadcasti128 %3,%%ymm2 \n"
|
||||
"vbroadcasti128 %4,%%ymm3 \n" LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
||||
@ -1317,7 +1317,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
|
||||
void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
|
||||
asm volatile("vbroadcasti128 %3,%%ymm2 \n" LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"vmovdqu 0x20(%0),%%ymm1 \n"
|
||||
@ -1532,8 +1532,8 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
"vpsllw $15,%%ymm5,%%ymm5 \n"
|
||||
"vpacksswb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
"vbroadcastf128 0(%3),%%ymm4 \n"
|
||||
"vbroadcastf128 0x60(%3),%%ymm7 \n"
|
||||
"vbroadcasti128 0(%3),%%ymm4 \n"
|
||||
"vbroadcasti128 0x60(%3),%%ymm7 \n"
|
||||
"vpmaddubsw %%ymm5,%%ymm4,%%ymm6 \n"
|
||||
"vphaddw %%ymm6,%%ymm6,%%ymm6 \n"
|
||||
"vpsubw %%ymm6,%%ymm7,%%ymm7 \n"
|
||||
@ -1933,8 +1933,8 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 0x20(%5),%%ymm4 \n" // RGBToU
|
||||
"vbroadcastf128 0x40(%5),%%ymm5 \n" // RGBToV
|
||||
"vbroadcasti128 0x20(%5),%%ymm4 \n" // RGBToU
|
||||
"vbroadcasti128 0x40(%5),%%ymm5 \n" // RGBToV
|
||||
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0x0101
|
||||
"vpabsb %%ymm6,%%ymm6 \n"
|
||||
"vmovdqa %6,%%ymm7 \n" // kShuffleAARRGGBB
|
||||
@ -3421,7 +3421,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
||||
"vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
|
||||
"vpermq $0xd8,%%zmm3,%%zmm3 \n" \
|
||||
"vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
|
||||
"vmovups (%[y_buf]),%%ymm4 \n" \
|
||||
"vmovdqu (%[y_buf]),%%ymm4 \n" \
|
||||
"vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
|
||||
"vpermq $0xd8,%%zmm4,%%zmm4 \n" \
|
||||
"vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
|
||||
@ -3614,32 +3614,26 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
||||
"vpshufb %%ymm7,%%ymm1,%%ymm3 \n" \
|
||||
"lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
|
||||
|
||||
// TODO(fbarchard): Remove broadcastb
|
||||
#if defined(__x86_64__)
|
||||
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
|
||||
"vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
|
||||
"vpcmpeqb %%ymm13,%%ymm13,%%ymm13 \n" \
|
||||
"vmovdqa (%[yuvconstants]),%%ymm8 \n" \
|
||||
"vpsllw $7,%%xmm13,%%xmm13 \n" \
|
||||
"vpabsb %%ymm13,%%ymm13 \n" \
|
||||
"vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
|
||||
"vpbroadcastb %%xmm13,%%ymm13 \n" \
|
||||
"vpsllw $7,%%ymm13,%%ymm13 \n" \
|
||||
"vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
|
||||
"vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
|
||||
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
|
||||
|
||||
#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
|
||||
"vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
|
||||
"movdqa (%[yuvconstants]),%%xmm8 \n" \
|
||||
"vpbroadcastq %%xmm8, %%zmm8 \n" \
|
||||
"vpsllw $7,%%xmm13,%%xmm13 \n" \
|
||||
"vpbroadcastb %%xmm13,%%zmm13 \n" \
|
||||
"movq 32(%[yuvconstants]),%%xmm9 \n" \
|
||||
"vpbroadcastq %%xmm9,%%zmm9 \n" \
|
||||
"movq 64(%[yuvconstants]),%%xmm10 \n" \
|
||||
"vpbroadcastq %%xmm10,%%zmm10 \n" \
|
||||
"movq 96(%[yuvconstants]),%%xmm11 \n" \
|
||||
"vpbroadcastq %%xmm11,%%zmm11 \n" \
|
||||
"movq 128(%[yuvconstants]),%%xmm12 \n" \
|
||||
"vpbroadcastq %%xmm12,%%zmm12 \n" \
|
||||
"vpternlogd $0xff,%%zmm13,%%zmm13,%%zmm13 \n" \
|
||||
"vpbroadcastq (%[yuvconstants]),%%zmm8 \n" \
|
||||
"vpabsb %%zmm13,%%zmm13 \n" \
|
||||
"vpsllw $7,%%zmm13,%%zmm13 \n" \
|
||||
"vpbroadcastq 32(%[yuvconstants]),%%zmm9 \n" \
|
||||
"vpbroadcastq 64(%[yuvconstants]),%%zmm10 \n" \
|
||||
"vpbroadcastq 96(%[yuvconstants]),%%zmm11 \n" \
|
||||
"vpbroadcastq 128(%[yuvconstants]),%%zmm12 \n" \
|
||||
"vmovups (%[quadsplitperm]),%%zmm16 \n" \
|
||||
"vmovups (%[dquadsplitperm]),%%zmm17 \n" \
|
||||
"vmovups (%[unperm]),%%zmm18 \n"
|
||||
@ -4384,8 +4378,8 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n"
|
||||
"vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2(
|
||||
"vbroadcasti128 %[kShuffleYUY2Y],%%ymm6 \n"
|
||||
"vbroadcasti128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2(
|
||||
yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
|
||||
LABELALIGN "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants)
|
||||
@ -4411,8 +4405,8 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n"
|
||||
"vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2(
|
||||
"vbroadcasti128 %[kShuffleUYVYY],%%ymm6 \n"
|
||||
"vbroadcasti128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2(
|
||||
yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
|
||||
LABELALIGN "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants)
|
||||
@ -4671,7 +4665,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
|
||||
#ifdef HAS_MIRRORROW_AVX2
|
||||
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
ptrdiff_t temp_width = (ptrdiff_t)(width);
|
||||
asm volatile("vbroadcastf128 %3,%%ymm5 \n"
|
||||
asm volatile("vbroadcasti128 %3,%%ymm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
@ -4719,7 +4713,7 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
|
||||
#ifdef HAS_MIRRORUVROW_AVX2
|
||||
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
|
||||
ptrdiff_t temp_width = (ptrdiff_t)(width);
|
||||
asm volatile("vbroadcastf128 %3,%%ymm5 \n"
|
||||
asm volatile("vbroadcasti128 %3,%%ymm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
@ -5207,7 +5201,7 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
|
||||
depth = 16 - depth;
|
||||
asm volatile(
|
||||
"vmovd %4,%%xmm3 \n"
|
||||
"vbroadcastf128 %5,%%ymm4 \n"
|
||||
"vbroadcasti128 %5,%%ymm4 \n"
|
||||
"sub %1,%2 \n"
|
||||
|
||||
// 16 pixels per loop.
|
||||
@ -6109,7 +6103,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
|
||||
"sub %1,%3 \n"
|
||||
"sub %1,%4 \n"
|
||||
"vmovdqa %7,%%ymm3 \n"
|
||||
"vbroadcastf128 %6,%%ymm4 \n"
|
||||
"vbroadcasti128 %6,%%ymm4 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
@ -6157,7 +6151,7 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vmovdqa %6,%%ymm3 \n"
|
||||
"vbroadcastf128 %5,%%ymm4 \n"
|
||||
"vbroadcasti128 %5,%%ymm4 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
@ -6406,7 +6400,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
|
||||
"sub %0,%1 \n"
|
||||
"sub %0,%2 \n"
|
||||
"sub %0,%3 \n"
|
||||
"vbroadcastf128 %7,%%ymm5 \n"
|
||||
"vbroadcasti128 %7,%%ymm5 \n"
|
||||
"vmovd %6,%%xmm6 \n"
|
||||
|
||||
LABELALIGN
|
||||
@ -6461,7 +6455,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
|
||||
asm volatile(
|
||||
"sub %0,%1 \n"
|
||||
"sub %0,%2 \n"
|
||||
"vbroadcastf128 %6,%%ymm5 \n"
|
||||
"vbroadcasti128 %6,%%ymm5 \n"
|
||||
"vmovd %5,%%xmm6 \n"
|
||||
"vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
|
||||
"vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff)
|
||||
@ -6694,7 +6688,7 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vmovdqa %3,%%ymm4 \n"
|
||||
"vbroadcastf128 %4,%%ymm5 \n"
|
||||
"vbroadcasti128 %4,%%ymm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
@ -7640,7 +7634,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
|
||||
uintptr_t alpha;
|
||||
asm volatile(
|
||||
"sub %0,%1 \n"
|
||||
"vbroadcastf128 %5,%%ymm5 \n"
|
||||
"vbroadcasti128 %5,%%ymm5 \n"
|
||||
|
||||
// 8 pixel loop.
|
||||
LABELALIGN
|
||||
@ -8887,7 +8881,7 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
const uint8_t* shuffler,
|
||||
int width) {
|
||||
asm volatile("vbroadcastf128 (%3),%%ymm5 \n"
|
||||
asm volatile("vbroadcasti128 (%3),%%ymm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
@ -9113,10 +9107,10 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
|
||||
const float* poly,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 (%3),%%ymm4 \n"
|
||||
"vbroadcastf128 0x10(%3),%%ymm5 \n"
|
||||
"vbroadcastf128 0x20(%3),%%ymm6 \n"
|
||||
"vbroadcastf128 0x30(%3),%%ymm7 \n"
|
||||
"vbroadcasti128 (%3),%%ymm4 \n"
|
||||
"vbroadcasti128 0x10(%3),%%ymm5 \n"
|
||||
"vbroadcasti128 0x20(%3),%%ymm6 \n"
|
||||
"vbroadcasti128 0x30(%3),%%ymm7 \n"
|
||||
|
||||
// 2 pixel loop.
|
||||
LABELALIGN
|
||||
@ -9148,46 +9142,8 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
|
||||
}
|
||||
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_SSE2
|
||||
static float kScaleBias = 1.9259299444e-34f;
|
||||
void HalfFloatRow_SSE2(const uint16_t* src,
|
||||
uint16_t* dst,
|
||||
float scale,
|
||||
int width) {
|
||||
scale *= kScaleBias;
|
||||
asm volatile(
|
||||
"movd %3,%%xmm4 \n"
|
||||
"pshufd $0x0,%%xmm4,%%xmm4 \n"
|
||||
"pxor %%xmm5,%%xmm5 \n"
|
||||
"sub %0,%1 \n"
|
||||
|
||||
// 8 pixel loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm2 \n" // 8 shorts
|
||||
"add $0x10,%0 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/3
|
||||
"cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
|
||||
"punpckhwd %%xmm5,%%xmm3 \n"
|
||||
"cvtdq2ps %%xmm3,%%xmm3 \n"
|
||||
"mulps %%xmm4,%%xmm2 \n"
|
||||
"mulps %%xmm4,%%xmm3 \n"
|
||||
"psrld $0xd,%%xmm2 \n"
|
||||
"psrld $0xd,%%xmm3 \n"
|
||||
"packssdw %%xmm3,%%xmm2 \n"
|
||||
"movdqu %%xmm2,-0x10(%0,%1,1) \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(scale) // %3
|
||||
: "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
|
||||
}
|
||||
#endif // HAS_HALFFLOATROW_SSE2
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
static float kScaleBias = 1.9259299444e-34f;
|
||||
void HalfFloatRow_AVX2(const uint16_t* src,
|
||||
uint16_t* dst,
|
||||
float scale,
|
||||
@ -9510,9 +9466,9 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"sub %0,%1 \n"
|
||||
"vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants
|
||||
"vbroadcastf128 16(%4),%%ymm5 \n"
|
||||
"vbroadcastf128 32(%4),%%ymm6 \n"
|
||||
"vbroadcasti128 (%4),%%ymm4 \n" // 3 shuffler constants
|
||||
"vbroadcasti128 16(%4),%%ymm5 \n"
|
||||
"vbroadcasti128 32(%4),%%ymm6 \n"
|
||||
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
|
||||
@ -9619,7 +9575,7 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
|
||||
#ifdef HAS_SWAPUVROW_AVX2
|
||||
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
asm volatile("vbroadcastf128 %3,%%ymm5 \n"
|
||||
asm volatile("vbroadcasti128 %3,%%ymm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user