mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
AVX version of Polynomial
BUG=265 TEST=untested R=thorcarpenter@google.com, yunqingwang@google.com Review URL: https://webrtc-codereview.appspot.com/2166004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@780 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
dcd87ffb8c
commit
6da76f3b34
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 779
|
Version: 780
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -142,8 +142,8 @@ extern "C" {
|
|||||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||||
// Effects:
|
// Effects:
|
||||||
#define HAS_ARGBCOLORTABLEROW_X86
|
#define HAS_ARGBCOLORTABLEROW_X86
|
||||||
#define HAS_RGBCOLORTABLEROW_X86
|
|
||||||
#define HAS_ARGBPOLYNOMIALROW_SSE2
|
#define HAS_ARGBPOLYNOMIALROW_SSE2
|
||||||
|
#define HAS_RGBCOLORTABLEROW_X86
|
||||||
|
|
||||||
// Caveat: Visual C 2012 required for AVX2.
|
// Caveat: Visual C 2012 required for AVX2.
|
||||||
#if _MSC_VER >= 1700
|
#if _MSC_VER >= 1700
|
||||||
@ -153,6 +153,7 @@ extern "C" {
|
|||||||
#define HAS_ARGBTOYROW_AVX2
|
#define HAS_ARGBTOYROW_AVX2
|
||||||
#define HAS_HALFROW_AVX2
|
#define HAS_HALFROW_AVX2
|
||||||
#define HAS_I422TOARGBROW_AVX2
|
#define HAS_I422TOARGBROW_AVX2
|
||||||
|
#define HAS_INTERPOLATEROW_AVX2
|
||||||
#define HAS_MERGEUVROW_AVX2
|
#define HAS_MERGEUVROW_AVX2
|
||||||
#define HAS_MIRRORROW_AVX2
|
#define HAS_MIRRORROW_AVX2
|
||||||
#define HAS_SPLITUVROW_AVX2
|
#define HAS_SPLITUVROW_AVX2
|
||||||
@ -162,17 +163,17 @@ extern "C" {
|
|||||||
#define HAS_YUY2TOUV422ROW_AVX2
|
#define HAS_YUY2TOUV422ROW_AVX2
|
||||||
#define HAS_YUY2TOUVROW_AVX2
|
#define HAS_YUY2TOUVROW_AVX2
|
||||||
#define HAS_YUY2TOYROW_AVX2
|
#define HAS_YUY2TOYROW_AVX2
|
||||||
#define HAS_INTERPOLATEROW_AVX2
|
|
||||||
|
|
||||||
// Effects:
|
// Effects:
|
||||||
#define HAS_ARGBADDROW_AVX2
|
#define HAS_ARGBADDROW_AVX2
|
||||||
#define HAS_ARGBATTENUATEROW_AVX2
|
#define HAS_ARGBATTENUATEROW_AVX2
|
||||||
#define HAS_ARGBMIRRORROW_AVX2
|
#define HAS_ARGBMIRRORROW_AVX2
|
||||||
#define HAS_ARGBMULTIPLYROW_AVX2
|
#define HAS_ARGBMULTIPLYROW_AVX2
|
||||||
|
#define HAS_ARGBPOLYNOMIALROW_AVX2
|
||||||
#define HAS_ARGBSUBTRACTROW_AVX2
|
#define HAS_ARGBSUBTRACTROW_AVX2
|
||||||
#define HAS_ARGBUNATTENUATEROW_AVX2
|
#define HAS_ARGBUNATTENUATEROW_AVX2
|
||||||
#endif
|
#endif // _MSC_VER >= 1700
|
||||||
#endif
|
#endif // defined(_MSC_VER)
|
||||||
|
|
||||||
// The following are Yasm x86 only:
|
// The following are Yasm x86 only:
|
||||||
// TODO(fbarchard): Port AVX2 to inline.
|
// TODO(fbarchard): Port AVX2 to inline.
|
||||||
@ -1549,10 +1550,12 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
|||||||
void ARGBPolynomialRow_C(const uint8* src_argb,
|
void ARGBPolynomialRow_C(const uint8* src_argb,
|
||||||
uint8* dst_argb, const float* poly,
|
uint8* dst_argb, const float* poly,
|
||||||
int width);
|
int width);
|
||||||
|
|
||||||
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
||||||
uint8* dst_argb, const float* poly,
|
uint8* dst_argb, const float* poly,
|
||||||
int width);
|
int width);
|
||||||
|
// AVX2 variant of the ARGB polynomial row function. Takes the same arguments
// as ARGBPolynomialRow_C and ARGBPolynomialRow_SSE2: source row, destination
// row, polynomial coefficients and width.
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||||
|
uint8* dst_argb, const float* poly,
|
||||||
|
int width);
|
||||||
|
|
||||||
// Divide num by div and return as 16.16 fixed point result.
|
// Divide num by div and return as 16.16 fixed point result.
|
||||||
int FixedDiv_C(int num, int div);
|
int FixedDiv_C(int num, int div);
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 779
|
#define LIBYUV_VERSION 780
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -2055,6 +2055,11 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||||
ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
|
ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
|
||||||
|
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 2)) {
|
||||||
|
ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
for (int y = 0; y < height; ++y) {
|
for (int y = 0; y < height; ++y) {
|
||||||
ARGBPolynomialRow(src_argb, dst_argb, poly, width);
|
ARGBPolynomialRow(src_argb, dst_argb, poly, width);
|
||||||
|
|||||||
@ -2042,10 +2042,10 @@ void ARGBPolynomialRow_C(const uint8* src_argb,
|
|||||||
dr += poly[14] * r3;
|
dr += poly[14] * r3;
|
||||||
da += poly[15] * a3;
|
da += poly[15] * a3;
|
||||||
|
|
||||||
dst_argb[0]= Clamp(static_cast<int32>(db));
|
dst_argb[0] = Clamp(static_cast<int32>(db));
|
||||||
dst_argb[1]= Clamp(static_cast<int32>(dg));
|
dst_argb[1] = Clamp(static_cast<int32>(dg));
|
||||||
dst_argb[2]= Clamp(static_cast<int32>(dr));
|
dst_argb[2] = Clamp(static_cast<int32>(dr));
|
||||||
dst_argb[3]= Clamp(static_cast<int32>(da));
|
dst_argb[3] = Clamp(static_cast<int32>(da));
|
||||||
src_argb += 4;
|
src_argb += 4;
|
||||||
dst_argb += 4;
|
dst_argb += 4;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -6783,7 +6783,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
|||||||
mov eax, [esp + 4] /* src_argb */
|
mov eax, [esp + 4] /* src_argb */
|
||||||
mov edx, [esp + 8] /* dst_argb */
|
mov edx, [esp + 8] /* dst_argb */
|
||||||
mov ecx, [esp + 16] /* width */
|
mov ecx, [esp + 16] /* width */
|
||||||
pxor xmm3, xmm3 // 4 bytes to 4 ints
|
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
@ -6814,6 +6814,66 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
|||||||
}
|
}
|
||||||
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
|
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
|
||||||
|
// Applies a cubic polynomial to every channel of a row of 4-byte pixels:
//   result = C0 + C1 * X + C2 * X * X + C3 * X * X * X
// poly points at 16 floats read as four 4-float vectors (C0 at byte offset 0,
// C1 at 16, C2 at 32, C3 at 48), one coefficient per channel of a pixel.
// Each loop iteration reads 8 bytes (2 pixels) from src_argb and writes
// 8 bytes to dst_argb. The loop body runs before the count is tested and the
// count is decremented by 2, so a width that is not a positive multiple of 2
// still processes a full 2-pixel group.
// The function is naked: there is no prologue, so arguments are read directly
// from the stack relative to esp.
__declspec(naked) __declspec(align(16))
|
||||||
|
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||||
|
uint8* dst_argb, const float* poly,
|
||||||
|
int width) {
|
||||||
|
__asm {
|
||||||
|
mov eax, [esp + 12] /* poly */
|
||||||
|
vmovdqu xmm4, [eax] // C0: 4 floats
|
||||||
|
vmovdqu xmm5, [eax + 16] // C1: 4 floats
|
||||||
|
vmovdqu xmm6, [eax + 32] // C2: 4 floats
|
||||||
|
vmovdqu xmm7, [eax + 48] // C3: 4 floats
|
||||||
|
vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords (same 4 coefficients for both pixels)
|
||||||
|
vpermq ymm5, ymm5, 0x44
|
||||||
|
vpermq ymm6, ymm6, 0x44
|
||||||
|
vpermq ymm7, ymm7, 0x44
|
||||||
|

||||||
|
mov eax, [esp + 4] /* src_argb */
|
||||||
|
mov edx, [esp + 8] /* dst_argb */
|
||||||
|
mov ecx, [esp + 16] /* width */
|
||||||
|
vpxor ymm3, ymm3, ymm3 // 0 constant for zero extending bytes to ints.
|
||||||
|

||||||
|
align 16
|
||||||
|
convertloop:
|
||||||
|
vmovq xmm0, qword ptr [eax] // 2 BGRA pixels
|
||||||
|
lea eax, [eax + 8]
|
||||||
|

||||||
|
// The next three instructions zero extend the 8 bytes to 8 dwords
|
||||||
|
// (the effect of: vpmovzxbd ymm0, xmm0).
|
||||||
|
// TODO(fbarchard): Consider vex256 to avoid vpermq.
|
||||||
|
vpunpcklbw xmm0, xmm0, xmm3 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
||||||
|
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
||||||
|
vpunpcklwd ymm0, ymm0, ymm3 // b000g000_r000a000_b000g000_r000a000
|
||||||
|

||||||
|
vcvtdq2ps ymm0, ymm0 // 8 floats
|
||||||
|
vmovdqa ymm1, ymm0 // X
|
||||||
|
vmulps ymm0, ymm0, ymm5 // C1 * X
|
||||||
|
vaddps ymm0, ymm0, ymm4 // result = C0 + C1 * X
|
||||||
|
vmovdqa ymm2, ymm1 // copy of X
|
||||||
|
vmulps ymm2, ymm2, ymm1 // X * X
|
||||||
|
vmulps ymm1, ymm1, ymm2 // X * X * X
|
||||||
|
vmulps ymm2, ymm2, ymm6 // C2 * X * X
|
||||||
|
vmulps ymm1, ymm1, ymm7 // C3 * X * X * X
|
||||||
|
vaddps ymm0, ymm0, ymm2 // result += C2 * X * X
|
||||||
|
vaddps ymm0, ymm0, ymm1 // result += C3 * X * X * X
|
||||||
|
vcvttps2dq ymm0, ymm0 // 8 ints, truncated toward zero
|
||||||
|

||||||
|
// Narrow the 8 dwords (b000g000_r000a000_b000g000_r000a000) back to 8 bytes.
|
||||||
|
// NOTE(review): vpackusdw saturates to unsigned words and vpackuswb then reads
|
||||||
|
// those words as signed, so a result above 32767 would become 0, not 255.
|
||||||
|
// Confirm the polynomial cannot produce such values.
|
||||||
|
vpackusdw ymm0, ymm0, ymm3 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
||||||
|
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
||||||
|
vpackuswb xmm0, xmm0, xmm3 // bgrabgra_00000000_00000000_00000000
|
||||||
|

||||||
|
sub ecx, 2 // 2 pixels done; sets the flags tested by jg (vmovq and lea leave flags unchanged)
|
||||||
|
vmovq qword ptr [edx], xmm0 // store 2 pixels
|
||||||
|
lea edx, [edx + 8]
|
||||||
|
jg convertloop
|
||||||
|
vzeroupper // zero the upper halves of the ymm registers before returning
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
|
||||||
|
|
||||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
@ -1665,7 +1665,7 @@ TEST_F(libyuvTest, TestARGBPolynomial) {
|
|||||||
0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x
|
0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x
|
||||||
0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x
|
0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x
|
||||||
0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x
|
0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x
|
||||||
};
|
};
|
||||||
|
|
||||||
// Test blue
|
// Test blue
|
||||||
orig_pixels[0][0] = 255u;
|
orig_pixels[0][0] = 255u;
|
||||||
|
|||||||
@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
// Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
|
// Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
|
||||||
|
|
||||||
#ifndef UTIL_PSNR_H_
|
#ifndef UTIL_PSNR_H_ // NOLINT
|
||||||
#define UTIL_PSNR_H_
|
#define UTIL_PSNR_H_
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
@ -36,4 +36,4 @@ double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
|
|||||||
} // extern "C"
|
} // extern "C"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // UTIL_PSNR_H_
|
#endif // UTIL_PSNR_H_ // NOLINT
|
||||||
|
|||||||
@ -8,7 +8,7 @@
|
|||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "./ssim.h"
|
#include "../util/ssim.h" // NOLINT
|
||||||
|
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|||||||
@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
|
// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
|
||||||
|
|
||||||
#ifndef UTIL_SSIM_H_
|
#ifndef UTIL_SSIM_H_ // NOLINT
|
||||||
#define UTIL_SSIM_H_
|
#define UTIL_SSIM_H_
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
@ -32,4 +32,4 @@ double CalcLSSIM(double ssim);
|
|||||||
} // extern "C"
|
} // extern "C"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // UTIL_SSIM_H_
|
#endif // UTIL_SSIM_H_ // NOLINT
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user