mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 09:16:48 +08:00
yasm ALIGN uppercase
BUG=none TEST=untested R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/4769005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@885 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
545a51c1d3
commit
04f40278df
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 884
|
Version: 885
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -1,168 +1,168 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||||
*
|
*
|
||||||
* Use of this source code is governed by a BSD-style license
|
* Use of this source code is governed by a BSD-style license
|
||||||
* that can be found in the LICENSE file in the root of the source
|
* that can be found in the LICENSE file in the root of the source
|
||||||
* tree. An additional intellectual property rights grant can be found
|
* tree. An additional intellectual property rights grant can be found
|
||||||
* in the file PATENTS. All contributing project authors may
|
* in the file PATENTS. All contributing project authors may
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
|
#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
|
||||||
|
|
||||||
#include "libyuv/basic_types.h"
|
#include "libyuv/basic_types.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Copy ARGB to ARGB.
|
// Copy ARGB to ARGB.
|
||||||
#define ARGBToARGB ARGBCopy
|
#define ARGBToARGB ARGBCopy
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
|
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_argb, int dst_stride_argb,
|
uint8* dst_argb, int dst_stride_argb,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To BGRA. (alias)
|
// Convert ARGB To BGRA. (alias)
|
||||||
#define ARGBToBGRA BGRAToARGB
|
#define ARGBToBGRA BGRAToARGB
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
|
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
|
||||||
uint8* dst_argb, int dst_stride_argb,
|
uint8* dst_argb, int dst_stride_argb,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To ABGR. (alias)
|
// Convert ARGB To ABGR. (alias)
|
||||||
#define ARGBToABGR ABGRToARGB
|
#define ARGBToABGR ABGRToARGB
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
|
int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
|
||||||
uint8* dst_argb, int dst_stride_argb,
|
uint8* dst_argb, int dst_stride_argb,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To RGBA.
|
// Convert ARGB To RGBA.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
|
int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
|
||||||
uint8* dst_argb, int dst_stride_argb,
|
uint8* dst_argb, int dst_stride_argb,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To RGB24.
|
// Convert ARGB To RGB24.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
|
int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_rgb24, int dst_stride_rgb24,
|
uint8* dst_rgb24, int dst_stride_rgb24,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To RAW.
|
// Convert ARGB To RAW.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
|
int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_rgb, int dst_stride_rgb,
|
uint8* dst_rgb, int dst_stride_rgb,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To RGB565.
|
// Convert ARGB To RGB565.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
|
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_rgb565, int dst_stride_rgb565,
|
uint8* dst_rgb565, int dst_stride_rgb565,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To ARGB1555.
|
// Convert ARGB To ARGB1555.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
|
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_argb1555, int dst_stride_argb1555,
|
uint8* dst_argb1555, int dst_stride_argb1555,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To ARGB4444.
|
// Convert ARGB To ARGB4444.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
|
int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_argb4444, int dst_stride_argb4444,
|
uint8* dst_argb4444, int dst_stride_argb4444,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To I444.
|
// Convert ARGB To I444.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToI444(const uint8* src_argb, int src_stride_argb,
|
int ARGBToI444(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_y, int dst_stride_y,
|
uint8* dst_y, int dst_stride_y,
|
||||||
uint8* dst_u, int dst_stride_u,
|
uint8* dst_u, int dst_stride_u,
|
||||||
uint8* dst_v, int dst_stride_v,
|
uint8* dst_v, int dst_stride_v,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To I422.
|
// Convert ARGB To I422.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToI422(const uint8* src_argb, int src_stride_argb,
|
int ARGBToI422(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_y, int dst_stride_y,
|
uint8* dst_y, int dst_stride_y,
|
||||||
uint8* dst_u, int dst_stride_u,
|
uint8* dst_u, int dst_stride_u,
|
||||||
uint8* dst_v, int dst_stride_v,
|
uint8* dst_v, int dst_stride_v,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To I420. (also in convert.h)
|
// Convert ARGB To I420. (also in convert.h)
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToI420(const uint8* src_argb, int src_stride_argb,
|
int ARGBToI420(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_y, int dst_stride_y,
|
uint8* dst_y, int dst_stride_y,
|
||||||
uint8* dst_u, int dst_stride_u,
|
uint8* dst_u, int dst_stride_u,
|
||||||
uint8* dst_v, int dst_stride_v,
|
uint8* dst_v, int dst_stride_v,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB to J420. (JPeg full range I420).
|
// Convert ARGB to J420. (JPeg full range I420).
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
|
int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_yj, int dst_stride_yj,
|
uint8* dst_yj, int dst_stride_yj,
|
||||||
uint8* dst_u, int dst_stride_u,
|
uint8* dst_u, int dst_stride_u,
|
||||||
uint8* dst_v, int dst_stride_v,
|
uint8* dst_v, int dst_stride_v,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To I411.
|
// Convert ARGB To I411.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToI411(const uint8* src_argb, int src_stride_argb,
|
int ARGBToI411(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_y, int dst_stride_y,
|
uint8* dst_y, int dst_stride_y,
|
||||||
uint8* dst_u, int dst_stride_u,
|
uint8* dst_u, int dst_stride_u,
|
||||||
uint8* dst_v, int dst_stride_v,
|
uint8* dst_v, int dst_stride_v,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB to J400. (JPeg full range).
|
// Convert ARGB to J400. (JPeg full range).
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
|
int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_yj, int dst_stride_yj,
|
uint8* dst_yj, int dst_stride_yj,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB to I400.
|
// Convert ARGB to I400.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToI400(const uint8* src_argb, int src_stride_argb,
|
int ARGBToI400(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_y, int dst_stride_y,
|
uint8* dst_y, int dst_stride_y,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To NV12.
|
// Convert ARGB To NV12.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
|
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_y, int dst_stride_y,
|
uint8* dst_y, int dst_stride_y,
|
||||||
uint8* dst_uv, int dst_stride_uv,
|
uint8* dst_uv, int dst_stride_uv,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To NV21.
|
// Convert ARGB To NV21.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
|
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_y, int dst_stride_y,
|
uint8* dst_y, int dst_stride_y,
|
||||||
uint8* dst_vu, int dst_stride_vu,
|
uint8* dst_vu, int dst_stride_vu,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To NV21.
|
// Convert ARGB To NV21.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
|
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_y, int dst_stride_y,
|
uint8* dst_y, int dst_stride_y,
|
||||||
uint8* dst_vu, int dst_stride_vu,
|
uint8* dst_vu, int dst_stride_vu,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To YUY2.
|
// Convert ARGB To YUY2.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
|
int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_yuy2, int dst_stride_yuy2,
|
uint8* dst_yuy2, int dst_stride_yuy2,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
// Convert ARGB To UYVY.
|
// Convert ARGB To UYVY.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
|
int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
|
||||||
uint8* dst_uyvy, int dst_stride_uyvy,
|
uint8* dst_uyvy, int dst_stride_uyvy,
|
||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 884
|
#define LIBYUV_VERSION 885
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -1,40 +1,40 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||||
*
|
*
|
||||||
* Use of this source code is governed by a BSD-style license
|
* Use of this source code is governed by a BSD-style license
|
||||||
* that can be found in the LICENSE file in the root of the source
|
* that can be found in the LICENSE file in the root of the source
|
||||||
* tree. An additional intellectual property rights grant can be found
|
* tree. An additional intellectual property rights grant can be found
|
||||||
* in the file PATENTS. All contributing project authors may
|
* in the file PATENTS. All contributing project authors may
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "libyuv/basic_types.h"
|
#include "libyuv/basic_types.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Portable reference implementation: returns the sum over the first `count`
// bytes of (src_a[i] - src_b[i])^2, accumulated in a uint32.
// A count of zero or less yields 0 because the loop never runs.
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
  uint32 total = 0u;
  int idx = 0;
  while (idx < count) {
    // Bytes promote to int, so the difference may be negative; squaring
    // makes it non-negative before the conversion to unsigned.
    const int delta = src_a[idx] - src_b[idx];
    total += static_cast<uint32>(delta * delta);
    ++idx;
  }
  return total;
}
|
||||||
|
|
||||||
// hash seed of 5381 recommended.
|
// hash seed of 5381 recommended.
|
||||||
// Internal C version of HashDjb2 with int sized count for efficiency.
|
// Internal C version of HashDjb2 with int sized count for efficiency.
|
||||||
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
|
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
|
||||||
uint32 hash = seed;
|
uint32 hash = seed;
|
||||||
for (int i = 0; i < count; ++i) {
|
for (int i = 0; i < count; ++i) {
|
||||||
hash += (hash << 5) + src[i];
|
hash += (hash << 5) + src[i];
|
||||||
}
|
}
|
||||||
return hash;
|
return hash;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -1,61 +1,61 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||||
*
|
*
|
||||||
* Use of this source code is governed by a BSD-style license
|
* Use of this source code is governed by a BSD-style license
|
||||||
* that can be found in the LICENSE file in the root of the source
|
* that can be found in the LICENSE file in the root of the source
|
||||||
* tree. An additional intellectual property rights grant can be found
|
* tree. An additional intellectual property rights grant can be found
|
||||||
* in the file PATENTS. All contributing project authors may
|
* in the file PATENTS. All contributing project authors may
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "libyuv/basic_types.h"
|
#include "libyuv/basic_types.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
|
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
|
||||||
|
|
||||||
// NEON implementation of the sum of squared byte differences between
// src_a and src_b.
//
// The loop body runs before the remaining count is tested and consumes
// 16 bytes from each source per pass, so it always reads at least 16 bytes
// and reads in whole 16-byte steps. Callers are therefore expected to pass
// a positive multiple of 16.
//
// Returns the low 32 bits of the accumulated sum.
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
  // Plain local: the asm writes it exactly once through the "=r" output, so
  // no volatile qualifier (and the memory round trip it forces) is needed.
  uint32 sse;
  asm volatile (
    // q8..q11 are four independent 4 x 32-bit accumulators.
    "vmov.u8 q8, #0 \n"
    "vmov.u8 q10, #0 \n"
    "vmov.u8 q9, #0 \n"
    "vmov.u8 q11, #0 \n"

    ".p2align 2 \n"
  "1: \n"
    "vld1.8 {q0}, [%0]! \n"    // 16 bytes of src_a, pointer post-incremented
    "vld1.8 {q1}, [%1]! \n"    // 16 bytes of src_b, pointer post-incremented
    "subs %2, %2, #16 \n"      // count -= 16; sets the flags read by bgt
    "vsubl.u8 q2, d0, d2 \n"   // widen low 8 byte differences to 16 bits
    "vsubl.u8 q3, d1, d3 \n"   // widen high 8 byte differences to 16 bits
    "vmlal.s16 q8, d4, d4 \n"  // accumulate squared differences as 32 bits
    "vmlal.s16 q9, d6, d6 \n"
    "vmlal.s16 q10, d5, d5 \n"
    "vmlal.s16 q11, d7, d7 \n"
    "bgt 1b \n"                // repeat while count > 0

    // Fold the four accumulators into one scalar.
    "vadd.u32 q8, q8, q9 \n"
    "vadd.u32 q10, q10, q11 \n"
    "vadd.u32 q11, q8, q10 \n"
    "vpaddl.u32 q1, q11 \n"    // pairwise add into two 64-bit lanes
    "vadd.u64 d0, d2, d3 \n"
    "vmov.32 %3, d0[0] \n"     // low 32 bits are the result
    : "+r"(src_a),  // %0
      "+r"(src_b),  // %1
      "+r"(count),  // %2
      "=r"(sse)     // %3
    :
    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  return sse;
}
|
||||||
|
|
||||||
#endif // __ARM_NEON__
|
#endif // __ARM_NEON__
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -1,166 +1,166 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||||
*
|
*
|
||||||
* Use of this source code is governed by a BSD-style license
|
* Use of this source code is governed by a BSD-style license
|
||||||
* that can be found in the LICENSE file in the root of the source
|
* that can be found in the LICENSE file in the root of the source
|
||||||
* tree. An additional intellectual property rights grant can be found
|
* tree. An additional intellectual property rights grant can be found
|
||||||
* in the file PATENTS. All contributing project authors may
|
* in the file PATENTS. All contributing project authors may
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "libyuv/basic_types.h"
|
#include "libyuv/basic_types.h"
|
||||||
#include "libyuv/row.h"
|
#include "libyuv/row.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
|
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
|
||||||
|
|
||||||
// Helpers that build the memory-operand text for inline asm operand
// number `base`:
//   MEMACCESS(base)      - a memory reference through operand `base`.
//   MEMLEA(offset, base) - an `offset(base)` address expression for lea.
#if !(defined(__native_client__) && defined(__x86_64__))
// Default: plain (%N) and offset(%N) operands.
#define MEMACCESS(base) "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
#else
// Native Client on x86-64: accesses are written as %nacl:(%r15,%qN), and
// lea uses the %q form of operand N.
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
#endif
|
||||||
|
|
||||||
// SSE2 (GCC inline asm) sum of squared byte differences between src_a and
// src_b.
//
// Both sources are read with movdqa, which requires 16-byte aligned
// addresses. The loop body runs before the remaining count is tested and
// handles 16 bytes per pass, so callers are expected to pass a positive
// multiple of 16.
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  uint32 sum_sq;
  asm volatile (  // NOLINT
    "pxor %%xmm0,%%xmm0 \n"  // xmm0: 4 x 32-bit running sums
    "pxor %%xmm5,%%xmm5 \n"  // xmm5: zero, used to widen bytes
    ".p2align 2 \n"
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm1 \n"  // 16 bytes of src_a
    "lea " MEMLEA(0x10, 0) ",%0 \n"
    "movdqa " MEMACCESS(1) ",%%xmm2 \n"  // 16 bytes of src_b
    "lea " MEMLEA(0x10, 1) ",%1 \n"
    // The flags set here are consumed by the jg below; none of the SSE
    // instructions in between modify them.
    "sub $0x10,%2 \n"
    // |a - b| per byte: saturating subtract in both directions, then OR.
    "movdqa %%xmm1,%%xmm3 \n"
    "psubusb %%xmm2,%%xmm1 \n"
    "psubusb %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm1 \n"
    // Widen to 16 bits, then square and add adjacent pairs into 32 bits.
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpckhbw %%xmm5,%%xmm2 \n"
    "pmaddwd %%xmm1,%%xmm1 \n"
    "pmaddwd %%xmm2,%%xmm2 \n"
    "paddd %%xmm1,%%xmm0 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "jg 1b \n"

    // Horizontal add of the four 32-bit lanes into lane 0.
    "pshufd $0xee,%%xmm0,%%xmm1 \n"
    "paddd %%xmm1,%%xmm0 \n"
    "pshufd $0x1,%%xmm0,%%xmm1 \n"
    "paddd %%xmm1,%%xmm0 \n"
    "movd %%xmm0,%3 \n"

    : "+r"(src_a),   // %0
      "+r"(src_b),   // %1
      "+r"(count),   // %2
      "=g"(sum_sq)   // %3
    :
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );  // NOLINT
  return sum_sq;
}
|
||||||
|
|
||||||
#endif // defined(__x86_64__) || defined(__i386__)
|
#endif // defined(__x86_64__) || defined(__i386__)
|
||||||
|
|
||||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||||
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
|
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
|
||||||
#define HAS_HASHDJB2_SSE41
|
#define HAS_HASHDJB2_SSE41
|
||||||
// Multiplier tables for the 16-bytes-per-iteration djb2 hash. All values
// are powers of 33 reduced to 32 bits. The tables are only ever read (they
// are passed to the asm as "m" inputs), so they are declared const.

// Scales the running hash by 33^16 once per 16-byte block (lane 0 only).
static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16

// Weights for the 16 bytes of a block, four per table, in input order.
static const uvec32 kHashMul0 = {
  0x0c3525e1,  // 33 ^ 15
  0xa3476dc1,  // 33 ^ 14
  0x3b4039a1,  // 33 ^ 13
  0x4f5f0981,  // 33 ^ 12
};
static const uvec32 kHashMul1 = {
  0x30f35d61,  // 33 ^ 11
  0x855cb541,  // 33 ^ 10
  0x040a9121,  // 33 ^ 9
  0x747c7101,  // 33 ^ 8
};
static const uvec32 kHashMul2 = {
  0xec41d4e1,  // 33 ^ 7
  0x4cfa3cc1,  // 33 ^ 6
  0x025528a1,  // 33 ^ 5
  0x00121881,  // 33 ^ 4
};
static const uvec32 kHashMul3 = {
  0x00008c61,  // 33 ^ 3
  0x00000441,  // 33 ^ 2
  0x00000021,  // 33 ^ 1
  0x00000001,  // 33 ^ 0
};
|
||||||
|
|
||||||
// SSE4.1 (GCC inline asm) djb2 hash over 16 bytes per iteration.
//
// For each 16-byte block the running hash is multiplied by kHash16x33 and
// the bytes, widened to 32 bits, are multiplied by the kHashMul0..3 tables
// and summed in. The loop body runs before the remaining count is tested,
// so callers are expected to pass a positive multiple of 16. The source is
// loaded with movdqu and so does not need to be aligned.
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  uint32 result;
  asm volatile (  // NOLINT
    "movd %2,%%xmm0 \n"      // xmm0 lane 0 = seed (running hash)
    "pxor %%xmm7,%%xmm7 \n"  // xmm7 = zero, used for widening
    "movdqa %4,%%xmm6 \n"    // xmm6 = kHash16x33
    ".p2align 2 \n"
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"  // next 16 source bytes
    "lea " MEMLEA(0x10, 0) ",%0 \n"
    "pmulld %%xmm6,%%xmm0 \n"            // hash *= kHash16x33
    // Bytes 0..3 times kHashMul0.
    "movdqa %5,%%xmm5 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm7,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm7,%%xmm3 \n"
    "pmulld %%xmm5,%%xmm3 \n"
    // Bytes 4..7 times kHashMul1.
    "movdqa %6,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm4 \n"
    "punpckhwd %%xmm7,%%xmm4 \n"
    "pmulld %%xmm5,%%xmm4 \n"
    // Bytes 8..11 times kHashMul2.
    "movdqa %7,%%xmm5 \n"
    "punpckhbw %%xmm7,%%xmm1 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm7,%%xmm2 \n"
    "pmulld %%xmm5,%%xmm2 \n"
    // Bytes 12..15 times kHashMul3.
    "movdqa %8,%%xmm5 \n"
    "punpckhwd %%xmm7,%%xmm1 \n"
    "pmulld %%xmm5,%%xmm1 \n"
    // Sum the four partial vectors. The flags set by the sub are consumed
    // by the jg below; the SSE instructions in between leave them alone.
    "paddd %%xmm4,%%xmm3 \n"
    "paddd %%xmm2,%%xmm1 \n"
    "sub $0x10,%1 \n"
    "paddd %%xmm3,%%xmm1 \n"
    // Horizontal add into lane 0, then fold into the running hash.
    "pshufd $0xe,%%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm1 \n"
    "pshufd $0x1,%%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm1 \n"
    "paddd %%xmm1,%%xmm0 \n"
    "jg 1b \n"
    "movd %%xmm0,%3 \n"
    : "+r"(src),     // %0
      "+r"(count),   // %1
      "+rm"(seed),   // %2
      "=g"(result)   // %3
    : "m"(kHash16x33),  // %4
      "m"(kHashMul0),   // %5
      "m"(kHashMul1),   // %6
      "m"(kHashMul2),   // %7
      "m"(kHashMul3)    // %8
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );  // NOLINT
  return result;
}
|
||||||
#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
|
#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -1,232 +1,232 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||||
*
|
*
|
||||||
* Use of this source code is governed by a BSD-style license
|
* Use of this source code is governed by a BSD-style license
|
||||||
* that can be found in the LICENSE file in the root of the source
|
* that can be found in the LICENSE file in the root of the source
|
||||||
* tree. An additional intellectual property rights grant can be found
|
* tree. An additional intellectual property rights grant can be found
|
||||||
* in the file PATENTS. All contributing project authors may
|
* in the file PATENTS. All contributing project authors may
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "libyuv/basic_types.h"
|
#include "libyuv/basic_types.h"
|
||||||
#include "libyuv/row.h"
|
#include "libyuv/row.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||||
|
|
||||||
// SSE2 (MSVC 32-bit inline asm) sum of squared byte differences between
// src_a and src_b.
//
// The function is naked: arguments are read straight from the stack and the
// result is returned in eax. Both sources are read with movdqa, which
// requires 16-byte aligned addresses. The loop body runs before the
// remaining count is tested and handles 16 bytes per pass, so callers are
// expected to pass a positive multiple of 16.
__declspec(naked) __declspec(align(16))
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    pxor       xmm0, xmm0        // 4 x 32-bit running sums
    pxor       xmm5, xmm5        // zero, used to widen bytes

    align      4
  wloop:
    movdqa     xmm1, [eax]       // 16 bytes of src_a
    lea        eax, [eax + 16]
    movdqa     xmm2, [edx]       // 16 bytes of src_b
    lea        edx, [edx + 16]
    sub        ecx, 16           // flags are consumed by the jg below
    movdqa     xmm3, xmm1        // |a - b|: saturating subtract both ways, OR
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm5        // widen low 8 bytes to 16 bits
    punpckhbw  xmm2, xmm5        // widen high 8 bytes to 16 bits
    pmaddwd    xmm1, xmm1        // square, add adjacent pairs into 32 bits
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    jg         wloop

    pshufd     xmm1, xmm0, 0xee  // fold upper two lanes onto lower two
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 0x01  // fold lane 1 onto lane 0
    paddd      xmm0, xmm1
    movd       eax, xmm0         // return value
    ret
  }
}
|
||||||
|
|
||||||
// Visual C 2012 required for AVX2.
|
// Visual C 2012 required for AVX2.
|
||||||
#if _MSC_VER >= 1700
|
#if _MSC_VER >= 1700
|
||||||
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
|
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
|
||||||
#pragma warning(disable: 4752)
|
#pragma warning(disable: 4752)
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 4] // src_a
|
mov eax, [esp + 4] // src_a
|
||||||
mov edx, [esp + 8] // src_b
|
mov edx, [esp + 8] // src_b
|
||||||
mov ecx, [esp + 12] // count
|
mov ecx, [esp + 12] // count
|
||||||
vpxor ymm0, ymm0, ymm0 // sum
|
vpxor ymm0, ymm0, ymm0 // sum
|
||||||
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
|
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
|
||||||
sub edx, eax
|
sub edx, eax
|
||||||
|
|
||||||
align 4
|
align 4
|
||||||
wloop:
|
wloop:
|
||||||
vmovdqu ymm1, [eax]
|
vmovdqu ymm1, [eax]
|
||||||
vmovdqu ymm2, [eax + edx]
|
vmovdqu ymm2, [eax + edx]
|
||||||
lea eax, [eax + 32]
|
lea eax, [eax + 32]
|
||||||
sub ecx, 32
|
sub ecx, 32
|
||||||
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
|
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
|
||||||
vpsubusb ymm2, ymm2, ymm1
|
vpsubusb ymm2, ymm2, ymm1
|
||||||
vpor ymm1, ymm2, ymm3
|
vpor ymm1, ymm2, ymm3
|
||||||
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
|
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
|
||||||
vpunpckhbw ymm1, ymm1, ymm5
|
vpunpckhbw ymm1, ymm1, ymm5
|
||||||
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
|
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
|
||||||
vpmaddwd ymm1, ymm1, ymm1
|
vpmaddwd ymm1, ymm1, ymm1
|
||||||
vpaddd ymm0, ymm0, ymm1
|
vpaddd ymm0, ymm0, ymm1
|
||||||
vpaddd ymm0, ymm0, ymm2
|
vpaddd ymm0, ymm0, ymm2
|
||||||
jg wloop
|
jg wloop
|
||||||
|
|
||||||
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
|
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
|
||||||
vpaddd ymm0, ymm0, ymm1
|
vpaddd ymm0, ymm0, ymm1
|
||||||
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
|
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
|
||||||
vpaddd ymm0, ymm0, ymm1
|
vpaddd ymm0, ymm0, ymm1
|
||||||
vpermq ymm1, ymm0, 0x02 // high + low lane.
|
vpermq ymm1, ymm0, 0x02 // high + low lane.
|
||||||
vpaddd ymm0, ymm0, ymm1
|
vpaddd ymm0, ymm0, ymm1
|
||||||
vmovd eax, xmm0
|
vmovd eax, xmm0
|
||||||
vzeroupper
|
vzeroupper
|
||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // _MSC_VER >= 1700
|
#endif // _MSC_VER >= 1700
|
||||||
|
|
||||||
#define HAS_HASHDJB2_SSE41
|
#define HAS_HASHDJB2_SSE41
|
||||||
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
|
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
|
||||||
static uvec32 kHashMul0 = {
|
static uvec32 kHashMul0 = {
|
||||||
0x0c3525e1, // 33 ^ 15
|
0x0c3525e1, // 33 ^ 15
|
||||||
0xa3476dc1, // 33 ^ 14
|
0xa3476dc1, // 33 ^ 14
|
||||||
0x3b4039a1, // 33 ^ 13
|
0x3b4039a1, // 33 ^ 13
|
||||||
0x4f5f0981, // 33 ^ 12
|
0x4f5f0981, // 33 ^ 12
|
||||||
};
|
};
|
||||||
static uvec32 kHashMul1 = {
|
static uvec32 kHashMul1 = {
|
||||||
0x30f35d61, // 33 ^ 11
|
0x30f35d61, // 33 ^ 11
|
||||||
0x855cb541, // 33 ^ 10
|
0x855cb541, // 33 ^ 10
|
||||||
0x040a9121, // 33 ^ 9
|
0x040a9121, // 33 ^ 9
|
||||||
0x747c7101, // 33 ^ 8
|
0x747c7101, // 33 ^ 8
|
||||||
};
|
};
|
||||||
static uvec32 kHashMul2 = {
|
static uvec32 kHashMul2 = {
|
||||||
0xec41d4e1, // 33 ^ 7
|
0xec41d4e1, // 33 ^ 7
|
||||||
0x4cfa3cc1, // 33 ^ 6
|
0x4cfa3cc1, // 33 ^ 6
|
||||||
0x025528a1, // 33 ^ 5
|
0x025528a1, // 33 ^ 5
|
||||||
0x00121881, // 33 ^ 4
|
0x00121881, // 33 ^ 4
|
||||||
};
|
};
|
||||||
static uvec32 kHashMul3 = {
|
static uvec32 kHashMul3 = {
|
||||||
0x00008c61, // 33 ^ 3
|
0x00008c61, // 33 ^ 3
|
||||||
0x00000441, // 33 ^ 2
|
0x00000441, // 33 ^ 2
|
||||||
0x00000021, // 33 ^ 1
|
0x00000021, // 33 ^ 1
|
||||||
0x00000001, // 33 ^ 0
|
0x00000001, // 33 ^ 0
|
||||||
};
|
};
|
||||||
|
|
||||||
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
|
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
|
||||||
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
|
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
|
||||||
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
|
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
|
||||||
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
|
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
|
||||||
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
|
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
|
||||||
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
|
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
|
||||||
_asm _emit 0x40 _asm _emit reg
|
_asm _emit 0x40 _asm _emit reg
|
||||||
|
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 4] // src
|
mov eax, [esp + 4] // src
|
||||||
mov ecx, [esp + 8] // count
|
mov ecx, [esp + 8] // count
|
||||||
movd xmm0, [esp + 12] // seed
|
movd xmm0, [esp + 12] // seed
|
||||||
|
|
||||||
pxor xmm7, xmm7 // constant 0 for unpck
|
pxor xmm7, xmm7 // constant 0 for unpck
|
||||||
movdqa xmm6, kHash16x33
|
movdqa xmm6, kHash16x33
|
||||||
|
|
||||||
align 4
|
align 4
|
||||||
wloop:
|
wloop:
|
||||||
movdqu xmm1, [eax] // src[0-15]
|
movdqu xmm1, [eax] // src[0-15]
|
||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
|
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
|
||||||
movdqa xmm5, kHashMul0
|
movdqa xmm5, kHashMul0
|
||||||
movdqa xmm2, xmm1
|
movdqa xmm2, xmm1
|
||||||
punpcklbw xmm2, xmm7 // src[0-7]
|
punpcklbw xmm2, xmm7 // src[0-7]
|
||||||
movdqa xmm3, xmm2
|
movdqa xmm3, xmm2
|
||||||
punpcklwd xmm3, xmm7 // src[0-3]
|
punpcklwd xmm3, xmm7 // src[0-3]
|
||||||
pmulld(0xdd) // pmulld xmm3, xmm5
|
pmulld(0xdd) // pmulld xmm3, xmm5
|
||||||
movdqa xmm5, kHashMul1
|
movdqa xmm5, kHashMul1
|
||||||
movdqa xmm4, xmm2
|
movdqa xmm4, xmm2
|
||||||
punpckhwd xmm4, xmm7 // src[4-7]
|
punpckhwd xmm4, xmm7 // src[4-7]
|
||||||
pmulld(0xe5) // pmulld xmm4, xmm5
|
pmulld(0xe5) // pmulld xmm4, xmm5
|
||||||
movdqa xmm5, kHashMul2
|
movdqa xmm5, kHashMul2
|
||||||
punpckhbw xmm1, xmm7 // src[8-15]
|
punpckhbw xmm1, xmm7 // src[8-15]
|
||||||
movdqa xmm2, xmm1
|
movdqa xmm2, xmm1
|
||||||
punpcklwd xmm2, xmm7 // src[8-11]
|
punpcklwd xmm2, xmm7 // src[8-11]
|
||||||
pmulld(0xd5) // pmulld xmm2, xmm5
|
pmulld(0xd5) // pmulld xmm2, xmm5
|
||||||
movdqa xmm5, kHashMul3
|
movdqa xmm5, kHashMul3
|
||||||
punpckhwd xmm1, xmm7 // src[12-15]
|
punpckhwd xmm1, xmm7 // src[12-15]
|
||||||
pmulld(0xcd) // pmulld xmm1, xmm5
|
pmulld(0xcd) // pmulld xmm1, xmm5
|
||||||
paddd xmm3, xmm4 // add 16 results
|
paddd xmm3, xmm4 // add 16 results
|
||||||
paddd xmm1, xmm2
|
paddd xmm1, xmm2
|
||||||
sub ecx, 16
|
sub ecx, 16
|
||||||
paddd xmm1, xmm3
|
paddd xmm1, xmm3
|
||||||
|
|
||||||
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||||
paddd xmm1, xmm2
|
paddd xmm1, xmm2
|
||||||
pshufd xmm2, xmm1, 0x01
|
pshufd xmm2, xmm1, 0x01
|
||||||
paddd xmm1, xmm2
|
paddd xmm1, xmm2
|
||||||
paddd xmm0, xmm1
|
paddd xmm0, xmm1
|
||||||
jg wloop
|
jg wloop
|
||||||
|
|
||||||
movd eax, xmm0 // return hash
|
movd eax, xmm0 // return hash
|
||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Visual C 2012 required for AVX2.
|
// Visual C 2012 required for AVX2.
|
||||||
#if _MSC_VER >= 1700
|
#if _MSC_VER >= 1700
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
|
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 4] // src
|
mov eax, [esp + 4] // src
|
||||||
mov ecx, [esp + 8] // count
|
mov ecx, [esp + 8] // count
|
||||||
movd xmm0, [esp + 12] // seed
|
movd xmm0, [esp + 12] // seed
|
||||||
movdqa xmm6, kHash16x33
|
movdqa xmm6, kHash16x33
|
||||||
|
|
||||||
align 4
|
align 4
|
||||||
wloop:
|
wloop:
|
||||||
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
|
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
|
||||||
pmulld xmm0, xmm6 // hash *= 33 ^ 16
|
pmulld xmm0, xmm6 // hash *= 33 ^ 16
|
||||||
vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
|
vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
|
||||||
pmulld xmm3, kHashMul0
|
pmulld xmm3, kHashMul0
|
||||||
vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
|
vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
|
||||||
pmulld xmm4, kHashMul1
|
pmulld xmm4, kHashMul1
|
||||||
vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
|
vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
|
||||||
pmulld xmm2, kHashMul2
|
pmulld xmm2, kHashMul2
|
||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
pmulld xmm1, kHashMul3
|
pmulld xmm1, kHashMul3
|
||||||
paddd xmm3, xmm4 // add 16 results
|
paddd xmm3, xmm4 // add 16 results
|
||||||
paddd xmm1, xmm2
|
paddd xmm1, xmm2
|
||||||
sub ecx, 16
|
sub ecx, 16
|
||||||
paddd xmm1, xmm3
|
paddd xmm1, xmm3
|
||||||
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||||
paddd xmm1, xmm2
|
paddd xmm1, xmm2
|
||||||
pshufd xmm2, xmm1, 0x01
|
pshufd xmm2, xmm1, 0x01
|
||||||
paddd xmm1, xmm2
|
paddd xmm1, xmm2
|
||||||
paddd xmm0, xmm1
|
paddd xmm0, xmm1
|
||||||
jg wloop
|
jg wloop
|
||||||
|
|
||||||
movd eax, xmm0 // return hash
|
movd eax, xmm0 // return hash
|
||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // _MSC_VER >= 1700
|
#endif // _MSC_VER >= 1700
|
||||||
|
|
||||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -49,17 +49,17 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
|
|||||||
ARGBToYRow_C;
|
ARGBToYRow_C;
|
||||||
void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
||||||
int pix) = ARGBToUV444Row_C;
|
int pix) = ARGBToUV444Row_C;
|
||||||
#if defined(HAS_ARGBTOUV444ROW_SSSE3)
|
#if defined(HAS_ARGBTOUV444ROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
|
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
|
||||||
ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
|
ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
|
||||||
if (IS_ALIGNED(width, 16)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
|
ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
|
||||||
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
|
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
|
||||||
ARGBToUV444Row = ARGBToUV444Row_SSSE3;
|
ARGBToUV444Row = ARGBToUV444Row_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_ARGBTOYROW_SSSE3)
|
#if defined(HAS_ARGBTOYROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
|
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
|
||||||
ARGBToYRow = ARGBToYRow_Any_SSSE3;
|
ARGBToYRow = ARGBToYRow_Any_SSSE3;
|
||||||
|
|||||||
@ -28,7 +28,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
|
|||||||
psrlw m2, m2, 8
|
psrlw m2, m2, 8
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
align 4
|
ALIGN 4
|
||||||
.convertloop:
|
.convertloop:
|
||||||
mov%2 m0, [src_yuy2q]
|
mov%2 m0, [src_yuy2q]
|
||||||
mov%2 m1, [src_yuy2q + mmsize]
|
mov%2 m1, [src_yuy2q + mmsize]
|
||||||
@ -74,7 +74,7 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
|
|||||||
psrlw m4, m4, 8
|
psrlw m4, m4, 8
|
||||||
sub dst_vq, dst_uq
|
sub dst_vq, dst_uq
|
||||||
|
|
||||||
align 4
|
ALIGN 4
|
||||||
.convertloop:
|
.convertloop:
|
||||||
mov%1 m0, [src_uvq]
|
mov%1 m0, [src_uvq]
|
||||||
mov%1 m1, [src_uvq + mmsize]
|
mov%1 m1, [src_uvq + mmsize]
|
||||||
@ -113,7 +113,7 @@ SplitUVRow a,
|
|||||||
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
|
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
|
||||||
sub src_vq, src_uq
|
sub src_vq, src_uq
|
||||||
|
|
||||||
align 4
|
ALIGN 4
|
||||||
.convertloop:
|
.convertloop:
|
||||||
mov%1 m0, [src_uq]
|
mov%1 m0, [src_uq]
|
||||||
mov%1 m1, [src_vq]
|
mov%1 m1, [src_vq]
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user