libyuv/source/compare_win.cc
Frank Barchard 62c19d062d [libyuv] Remove all x86 SSE optimizations
Removed all SSE functions, macros, dispatching logic, and related
unit tests across the repository to reduce code size and complexity.
Left cpuid detection intact. Supported architectures like AVX2, NEON,
SVE, etc. are unaffected.

R=rrwinterton@gmail.com

Bug: None
Test: Build and run libyuv_unittest
Change-Id: Id19608dba35b79c4c8fc31f920a6a968883d300f
2026-04-29 16:56:03 -07:00

149 lines
4.1 KiB
C++

/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#if defined(_MSC_VER)
#include <intrin.h> // For __popcnt
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \
(!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN))
__declspec(naked) uint32_t
#ifdef HAS_SUMSQUAREERROR_AVX2
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
__declspec(naked) uint32_t
SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
vpxor ymm0, ymm0, ymm0 // sum
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
vpunpckhbw ymm1, ymm1, ymm5
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpermq ymm1, ymm0, 0x02 // high + low lane.
vpaddd ymm0, ymm0, ymm1
vmovd eax, xmm0
vzeroupper
ret
}
}
#endif // HAS_SUMSQUAREERROR_AVX2
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
__declspec(naked) uint32_t
// Visual C 2012 required for AVX2.
#ifdef HAS_HASHDJB2_AVX2
__declspec(naked) uint32_t
HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
vmovd xmm0, [esp + 12] // seed
wloop:
vpmovzxbd xmm3, [eax] // src[0-3]
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
vpmovzxbd xmm4, [eax + 4] // src[4-7]
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
vpmovzxbd xmm2, [eax + 8] // src[8-11]
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
vpmovzxbd xmm1, [eax + 12] // src[12-15]
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
lea eax, [eax + 16]
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
vpaddd xmm1, xmm1,xmm2
vpshufd xmm2, xmm1, 0x01
vpaddd xmm1, xmm1, xmm2
vpaddd xmm0, xmm0, xmm1
sub ecx, 16
jg wloop
vmovd eax, xmm0 // return hash
vzeroupper
ret
}
}
#endif // HAS_HASHDJB2_AVX2
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif