Merge pull request #153 from fastfloat/dlemire/renabling_clinger

Conditional Clinger's fast path
This commit is contained in:
Daniel Lemire 2022-11-23 10:36:55 -05:00 committed by GitHub
commit 8f092d2799
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 331 additions and 60 deletions

18
.github/workflows/ubuntu20-fastmath.yml vendored Normal file
View File

@ -0,0 +1,18 @@
# CI workflow: configures the project with -ffast-math, builds it with the
# tests enabled (FASTFLOAT_TEST=ON) and runs them through ctest.
# Triggered on every push and pull request.
name: Ubuntu 20.04 CI (GCC 9, fast-math)
on: [push, pull_request]
jobs:
ubuntu-build:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
steps:
- uses: actions/checkout@v3
- name: Use cmake
run: |
mkdir build &&
cd build &&
cmake -DCMAKE_CXX_FLAGS="-ffast-math" -DFASTFLOAT_TEST=ON .. &&
cmake --build . &&
ctest --output-on-failure

View File

@ -17,7 +17,7 @@ namespace fast_float {
// we might have platforms where `CHAR_BIT` is not 8, so let's avoid
// doing `8 * sizeof(limb)`.
#if defined(FASTFLOAT_64BIT) && !defined(__sparc)
#define FASTFLOAT_64BIT_LIMB
#define FASTFLOAT_64BIT_LIMB 1
typedef uint64_t limb;
constexpr size_t limb_bits = 64;
#else

View File

@ -12,11 +12,11 @@
|| defined(__MINGW64__) \
|| defined(__s390x__) \
|| (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) )
#define FASTFLOAT_64BIT
#define FASTFLOAT_64BIT 1
#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \
|| defined(__arm__) || defined(_M_ARM) \
|| defined(__MINGW32__) || defined(__EMSCRIPTEN__))
#define FASTFLOAT_32BIT
#define FASTFLOAT_32BIT 1
#else
// Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow.
// We can never tell the register width, but the SIZE_MAX is a good approximation.
@ -24,9 +24,9 @@
#if SIZE_MAX == 0xffff
#error Unknown platform (16-bit, unsupported)
#elif SIZE_MAX == 0xffffffff
#define FASTFLOAT_32BIT
#define FASTFLOAT_32BIT 1
#elif SIZE_MAX == 0xffffffffffffffff
#define FASTFLOAT_64BIT
#define FASTFLOAT_64BIT 1
#else
#error Unknown platform (not 32-bit, not 64-bit?)
#endif
@ -272,10 +272,12 @@ template <typename T> struct binary_format {
static inline constexpr int minimum_exponent();
static inline constexpr int infinite_power();
static inline constexpr int sign_index();
static inline constexpr int min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST
static inline constexpr int max_exponent_fast_path();
static inline constexpr int max_exponent_round_to_even();
static inline constexpr int min_exponent_round_to_even();
static inline constexpr uint64_t max_mantissa_fast_path(int64_t power);
static inline constexpr uint64_t max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST
static inline constexpr int largest_power_of_ten();
static inline constexpr int smallest_power_of_ten();
static inline constexpr T exact_power_of_ten(int64_t power);
@ -285,6 +287,22 @@ template <typename T> struct binary_format {
static inline constexpr equiv_uint hidden_bit_mask();
};
// Smallest decimal exponent accepted by the classic Clinger fast path for
// double (the path used when fegetround() == FE_TONEAREST).
// Negative exponents (down to -22) are only enabled when FLT_EVAL_METHOD is
// 0 or 1; for any other evaluation method the lower bound is 0.
template <> inline constexpr int binary_format<double>::min_exponent_fast_path() {
#if (FLT_EVAL_METHOD == 0) || (FLT_EVAL_METHOD == 1)
  return -22;
#else
  return 0;
#endif
}
// Smallest decimal exponent accepted by the classic Clinger fast path for
// float (the path used when fegetround() == FE_TONEAREST).
// Negative exponents (down to -10) are only enabled when FLT_EVAL_METHOD is
// 0 or 1; for any other evaluation method the lower bound is 0.
template <> inline constexpr int binary_format<float>::min_exponent_fast_path() {
#if (FLT_EVAL_METHOD == 0) || (FLT_EVAL_METHOD == 1)
  return -10;
#else
  return 0;
#endif
}
// Number of explicitly stored mantissa bits for double: 52.
// Kept as a single return statement so the function stays a valid
// C++11 constexpr function.
template <> inline constexpr int binary_format<double>::mantissa_explicit_bits() {
return 52;
}
@ -331,13 +349,18 @@ template <> inline constexpr int binary_format<double>::max_exponent_fast_path()
// Largest decimal exponent accepted by the fast path for float: 10.
template <> inline constexpr int binary_format<float>::max_exponent_fast_path() {
return 10;
}
// Largest decimal mantissa accepted by the classic Clinger fast path for
// double (used when fegetround() == FE_TONEAREST):
// 2^(mantissa_explicit_bits() + 1), i.e. 2^53.
template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path() {
  return uint64_t(1) << (mantissa_explicit_bits() + 1);
}
// Largest decimal mantissa accepted by the modified fast path for double at
// the given power of ten, read from the max_mantissa_double table.
template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path(int64_t power) {
// The caller is responsible for ensuring that
// power >= 0 && power <= 22;
// the table lookup below is not bounds-checked.
return max_mantissa_double[power];
}
// Largest decimal mantissa accepted by the classic Clinger fast path for
// float (used when fegetround() == FE_TONEAREST):
// 2^(mantissa_explicit_bits() + 1).
template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path() {
  return uint64_t(1) << (mantissa_explicit_bits() + 1);
}
template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path(int64_t power) {
// caller is responsible to ensure that
// power >= 0 && power <= 10

View File

@ -60,6 +60,48 @@ from_chars_result parse_infnan(const char *first, const char *last, T &value) n
return answer;
}
/**
 * Returns true if the conventional Clinger fast path may rely on
 * round-to-nearest arithmetic on the calling thread.
 *
 * On targets that evaluate float/double expressions at their own precision
 * (FLT_EVAL_METHOD is 0 or 1), this is true exactly when the floating-point
 * rounding mode is 'to nearest', which is the default on most systems.
 * On any other target (e.g., excess precision as with 387 instructions),
 * a product or quotient may be rounded twice, so the function
 * conservatively returns false and callers fall back to the path that only
 * performs exact multiplications.
 *
 * This function is meant to be inexpensive.
 * Credit : @mwalcott3
 */
fastfloat_really_inline bool rounds_to_nearest() noexcept {
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
  // Same condition as binary_format<T>::min_exponent_fast_path(): with
  // excess precision, `value * 10^k` may be rounded once to the wider
  // format and once more to T, so round-to-nearest semantics cannot be
  // assumed even when fegetround() == FE_TONEAREST.
  return false;
#else
  // See
  // A fast function to check your floating-point rounding mode
  // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/
  //
  // This function is meant to be equivalent to :
  // prior: #include <cfenv>
  //  return fegetround() == FE_TONEAREST;
  // However, it is expected to be much faster than the fegetround()
  // function call.
  //
  // The volatile keyword prevents the compiler from computing the function
  // at compile-time.
  // There might be other ways to prevent compile-time optimizations (e.g., asm).
  // The value does not need to be std::numeric_limits<float>::min(), any small
  // value so that 1 + x should round to 1 would do.
  static volatile float fmin = std::numeric_limits<float>::min();
  float fmini = fmin; // we copy it so that it gets loaded at most once.
  //
  // Explanation:
  // Only when fegetround() == FE_TONEAREST do we have that
  //   fmin + 1.0f == 1.0f - fmin.
  //
  // FE_UPWARD:
  //  fmin + 1.0f > 1
  //  1.0f - fmin == 1
  //
  // FE_DOWNWARD or FE_TOWARDZERO:
  //  fmin + 1.0f == 1
  //  1.0f - fmin < 1
  //
  // Note: This may fail to be accurate if fast-math has been
  // enabled, as rounding conventions may not apply.
  return (fmini + 1.0f == 1.0f - fmini);
#endif
}
} // namespace detail
template<typename T>
@ -87,13 +129,46 @@ from_chars_result from_chars_advanced(const char *first, const char *last,
}
answer.ec = std::errc(); // be optimistic
answer.ptr = pns.lastmatch;
// Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
if (pns.exponent >= 0 && pns.exponent <= binary_format<T>::max_exponent_fast_path() && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent) && !pns.too_many_digits) {
// The implementation of the Clinger's fast path is convoluted because
// we want round-to-nearest in all cases, irrespective of the rounding mode
// selected on the thread.
// We proceed optimistically, assuming that detail::rounds_to_nearest() returns
// true.
if (binary_format<T>::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format<T>::max_exponent_fast_path() && !pns.too_many_digits) {
// Unfortunately, the conventional Clinger's fast path is only possible
// when the system rounds to the nearest float.
//
// We expect the next branch to almost always be selected.
// We could check it first (before the previous branch), but
// there might be performance advantages at having the check
// be last.
if(detail::rounds_to_nearest()) {
// We have that fegetround() == FE_TONEAREST.
// Next is Clinger's fast path.
if (pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) {
value = T(pns.mantissa);
value = value * binary_format<T>::exact_power_of_ten(pns.exponent);
if (pns.exponent < 0) { value = value / binary_format<T>::exact_power_of_ten(-pns.exponent); }
else { value = value * binary_format<T>::exact_power_of_ten(pns.exponent); }
if (pns.negative) { value = -value; }
return answer;
}
} else {
// We do not have that fegetround() == FE_TONEAREST.
// Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
if (pns.exponent >= 0 && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent)) {
#if (defined(_WIN32) && defined(__clang__))
// ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD
if(pns.mantissa == 0) {
value = 0;
return answer;
}
#endif
value = T(pns.mantissa) * binary_format<T>::exact_power_of_ten(pns.exponent);
if (pns.negative) { value = -value; }
return answer;
}
}
}
adjusted_mantissa am = compute_float<binary_format<T>>(pns.exponent, pns.mantissa);
if(pns.too_many_digits && am.power2 >= 0) {
if(am != compute_float<binary_format<T>>(pns.exponent, pns.mantissa + 1)) {

View File

@ -9,7 +9,7 @@ option(SYSTEM_DOCTEST "Use system copy of doctest" OFF)
if (NOT SYSTEM_DOCTEST)
FetchContent_Declare(doctest
GIT_REPOSITORY https://github.com/onqtam/doctest.git
GIT_TAG 2.4.6)
GIT_TAG v2.4.9)
endif()
FetchContent_Declare(supplemental_test_files
GIT_REPOSITORY https://github.com/fastfloat/supplemental_test_files.git

View File

@ -10,6 +10,7 @@
#include <limits>
#include <string>
#include <system_error>
#include <cfenv>
#ifndef SUPPLEMENTAL_TEST_DATA_DIR
#define SUPPLEMENTAL_TEST_DATA_DIR "data/"
@ -42,6 +43,145 @@
#define FASTFLOAT_ODDPLATFORM 1
#endif
// Stream helper for integers: inserts v in hexadecimal with a "0x" prefix,
// then again in decimal between parentheses. The stream is left in decimal
// mode. Note that v is evaluated twice.
#define iHexAndDec(v) std::hex << "0x" << (v) << " (" << std::dec << (v) << ")"
// Stream helper for floating-point values: inserts v in std::hexfloat
// notation, then again in std::defaultfloat notation between parentheses.
// Note that v is evaluated twice.
#define fHexAndDec(v) std::hexfloat << (v) << " (" << std::defaultfloat << (v) << ")"
// Translates a <cfenv> rounding-mode constant into the name of its macro,
// for use in log output. Any value that is not one of the four standard
// rounding modes yields "UNKNOWN".
const char * round_name(int d) {
  if (d == FE_UPWARD) { return "FE_UPWARD"; }
  if (d == FE_DOWNWARD) { return "FE_DOWNWARD"; }
  if (d == FE_TOWARDZERO) { return "FE_TOWARDZERO"; }
  if (d == FE_TONEAREST) { return "FE_TONEAREST"; }
  return "UNKNOWN";
}
// Turns its argument into a string literal.
#define FASTFLOAT_STR(x) #x
// Prints "NAME='VALUE'" for a macro. `#x` stringifies the macro name as
// written, while the argument handed to FASTFLOAT_STR is macro-expanded
// first, so the second string is the macro's replacement text.
#define SHOW_DEFINE(x) printf("%s='%s'\n", #x, FASTFLOAT_STR(x))
// Diagnostic test case: makes no assertions. It prints the configuration
// macros that are defined for this build, followed by the current
// floating-point rounding mode, so that CI logs document the platform.
TEST_CASE("system_info") {
std::cout << "system info:" << std::endl;
// Each macro is reported only when it is defined.
#ifdef _MSC_VER
SHOW_DEFINE(_MSC_VER);
#endif
#ifdef FASTFLOAT_64BIT_LIMB
SHOW_DEFINE(FASTFLOAT_64BIT_LIMB);
#endif
#ifdef __clang__
SHOW_DEFINE(__clang__);
#endif
#ifdef FASTFLOAT_VISUAL_STUDIO
SHOW_DEFINE(FASTFLOAT_VISUAL_STUDIO);
#endif
// Endianness is reported as text rather than as a raw macro value.
#ifdef FASTFLOAT_IS_BIG_ENDIAN
#if FASTFLOAT_IS_BIG_ENDIAN
printf("big endian\n");
#else
printf("little endian\n");
#endif
#endif
#ifdef FASTFLOAT_32BIT
SHOW_DEFINE(FASTFLOAT_32BIT);
#endif
#ifdef FASTFLOAT_64BIT
SHOW_DEFINE(FASTFLOAT_64BIT);
#endif
#ifdef FLT_EVAL_METHOD
SHOW_DEFINE(FLT_EVAL_METHOD);
#endif
#ifdef _WIN32
SHOW_DEFINE(_WIN32);
#endif
#ifdef _WIN64
SHOW_DEFINE(_WIN64);
#endif
// Rounding mode in effect when the test runs.
std::cout << "fegetround() = " << round_name(fegetround()) << std::endl;
std::cout << std::endl;
}
// Checks fast_float::detail::rounds_to_nearest() under each of the four
// standard rounding modes. The two probe expressions are printed in
// hexfloat notation (fHexAndDec) because their difference from 1 is far
// below what the default 6-digit decimal output can show.
TEST_CASE("rounds_to_nearest") {
  //
  // If this function fails, we may be left in a non-standard rounding state.
  //
  static volatile float fmin = std::numeric_limits<float>::min();
  fesetround(FE_UPWARD);
  std::cout << "FE_UPWARD: fmin + 1.0f = " << fHexAndDec(fmin + 1.0f) << " 1.0f - fmin = " << fHexAndDec(1.0f - fmin) << std::endl;
  CHECK(fegetround() == FE_UPWARD);
  CHECK(fast_float::detail::rounds_to_nearest() == false);

  fesetround(FE_DOWNWARD);
  std::cout << "FE_DOWNWARD: fmin + 1.0f = " << fHexAndDec(fmin + 1.0f) << " 1.0f - fmin = " << fHexAndDec(1.0f - fmin) << std::endl;
  CHECK(fegetround() == FE_DOWNWARD);
  CHECK(fast_float::detail::rounds_to_nearest() == false);

  fesetround(FE_TOWARDZERO);
  std::cout << "FE_TOWARDZERO: fmin + 1.0f = " << fHexAndDec(fmin + 1.0f) << " 1.0f - fmin = " << fHexAndDec(1.0f - fmin) << std::endl;
  CHECK(fegetround() == FE_TOWARDZERO);
  CHECK(fast_float::detail::rounds_to_nearest() == false);

  // Restores the default mode, so the test leaves the thread in FE_TONEAREST.
  fesetround(FE_TONEAREST);
  std::cout << "FE_TONEAREST: fmin + 1.0f = " << fHexAndDec(fmin + 1.0f) << " 1.0f - fmin = " << fHexAndDec(1.0f - fmin) << std::endl;
  CHECK(fegetround() == FE_TONEAREST);
#if (FLT_EVAL_METHOD == 1) || (FLT_EVAL_METHOD == 0)
  // A positive answer is only required on strict-precision targets; with
  // extended-precision evaluation the detector is allowed to be
  // conservative and report false.
  CHECK(fast_float::detail::rounds_to_nearest() == true);
#endif
}
// Parses the string "0" under each rounding mode and checks that the result
// compares equal to zero and has an all-zero bit pattern (i.e. it is +0.0,
// not -0.0).
TEST_CASE("parse_zero") {
  //
  // If this function fails, we may be left in a non-standard rounding state.
  //
  struct rounding_case {
    int mode;
    const char * label;
  };
  // Order matters: the last entry leaves the thread in FE_TONEAREST.
  const rounding_case cases[] = {
    {FE_UPWARD, "FE_UPWARD"},
    {FE_TOWARDZERO, "FE_TOWARDZERO"},
    {FE_DOWNWARD, "FE_DOWNWARD"},
    {FE_TONEAREST, "FE_TONEAREST"},
  };

  const char * zero = "0";
  uint64_t float64_parsed;
  double f = 0;
  // Sanity check: a literal 0.0 has an all-zero representation.
  ::memcpy(&float64_parsed, &f, sizeof(f));
  CHECK(float64_parsed == 0);

  for (const rounding_case &rc : cases) {
    fesetround(rc.mode);
    auto r = fast_float::from_chars(zero, zero + 1, f);
    CHECK(r.ec == std::errc());
    std::cout << rc.label << " parsed zero as " << iHexAndDec(f) << std::endl;
    CHECK(f == 0);
    // f == 0 also holds for -0.0, so inspect the raw bits as well.
    ::memcpy(&float64_parsed, &f, sizeof(f));
    std::cout << "double as uint64_t is " << float64_parsed << std::endl;
    CHECK(float64_parsed == 0);
  }
}
// C++ 17 because it is otherwise annoying to browse all files in a directory.
// We also only run these tests on little endian systems.
#if (FASTFLOAT_CPLUSPLUS >= 201703L) && (FASTFLOAT_IS_BIG_ENDIAN == 0) && !defined(FASTFLOAT_ODDPLATFORM)
@ -50,9 +190,16 @@
#include <filesystem>
#include <charconv>
// return true on success
bool check_file(std::string file_name) {
std::cout << "Checking " << file_name << std::endl;
// We check all rounding directions, for each file.
std::vector<int> directions = {FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO, FE_TONEAREST};
for (int d : directions) {
std::cout << "fesetround to " << round_name(d) << std::endl;
fesetround(d);
size_t number{0};
std::fstream newfile(file_name, std::ios::in);
if (newfile.is_open()) {
@ -88,10 +235,18 @@ bool check_file(std::string file_name) {
// Compare with expected results
if (float32_parsed != float32) {
std::cout << "bad 32 " << str << std::endl;
std::cout << "parsed as " << iHexAndDec(parsed_32) << std::endl;
std::cout << "as raw uint32_t, parsed = " << float32_parsed << ", expected = " << float32 << std::endl;
std::cout << "fesetround: " << round_name(d) << std::endl;
fesetround(FE_TONEAREST);
return false;
}
if (float64_parsed != float64) {
std::cout << "bad 64 " << str << std::endl;
std::cout << "parsed as " << iHexAndDec(parsed_64) << std::endl;
std::cout << "as raw uint64_t, parsed = " << float64_parsed << ", expected = " << float64 << std::endl;
std::cout << "fesetround: " << round_name(d) << std::endl;
fesetround(FE_TONEAREST);
return false;
}
number++;
@ -101,8 +256,11 @@ bool check_file(std::string file_name) {
newfile.close(); // close the file object
} else {
std::cout << "Could not read " << file_name << std::endl;
fesetround(FE_TONEAREST);
return false;
}
}
fesetround(FE_TONEAREST);
return true;
}
@ -125,9 +283,6 @@ TEST_CASE("leading_zeroes") {
CHECK(fast_float::leading_zeroes(bit << 63) == 0);
}
#define iHexAndDec(v) std::hex << "0x" << (v) << " (" << std::dec << (v) << ")"
#define fHexAndDec(v) std::hexfloat << (v) << " (" << std::defaultfloat << (v) << ")"
void test_full_multiplication(uint64_t lhs, uint64_t rhs, uint64_t expected_lo, uint64_t expected_hi) {
fast_float::value128 v;
v = fast_float::full_multiplication(lhs, rhs);