Make IEEEFloat::roundToIntegral more standard conformant

Behavior of IEEEFloat::roundToIntegral is aligned with IEEE-754
operation roundToIntegralExact. In particular, this function now:
- returns opInvalid for signaling NaNs,
- returns opInexact if the result of rounding differs from the argument.

Differential Revision: https://reviews.llvm.org/D75246
This commit is contained in:
Serge Pavlov 2020-02-27 18:16:14 +07:00
parent c0ad75e758
commit 14a1b80e04
2 changed files with 171 additions and 9 deletions

View File

@ -1977,14 +1977,59 @@ IEEEFloat::opStatus IEEEFloat::fusedMultiplyAdd(const IEEEFloat &multiplicand,
return fs;
}
/* Rounding-mode corrrect round to integral value. */
/* Rounding-mode correct round to integral value. */
IEEEFloat::opStatus IEEEFloat::roundToIntegral(roundingMode rounding_mode) {
opStatus fs;
if (isInfinity())
// [IEEE Std 754-2008 6.1]:
// The behavior of infinity in floating-point arithmetic is derived from the
// limiting cases of real arithmetic with operands of arbitrarily
// large magnitude, when such a limit exists.
// ...
// Operations on infinite operands are usually exact and therefore signal no
// exceptions ...
return opOK;
if (isNaN()) {
if (isSignaling()) {
// [IEEE Std 754-2008 6.2]:
// Under default exception handling, any operation signaling an invalid
// operation exception and for which a floating-point result is to be
// delivered shall deliver a quiet NaN.
makeQuiet();
// [IEEE Std 754-2008 6.2]:
// Signaling NaNs shall be reserved operands that, under default exception
// handling, signal the invalid operation exception(see 7.2) for every
// general-computational and signaling-computational operation except for
// the conversions described in 5.12.
return opInvalidOp;
} else {
// [IEEE Std 754-2008 6.2]:
// For an operation with quiet NaN inputs, other than maximum and minimum
// operations, if a floating-point result is to be delivered the result
// shall be a quiet NaN which should be one of the input NaNs.
// ...
// Every general-computational and quiet-computational operation involving
// one or more input NaNs, none of them signaling, shall signal no
// exception, except fusedMultiplyAdd might signal the invalid operation
// exception(see 7.2).
return opOK;
}
}
if (isZero()) {
// [IEEE Std 754-2008 6.3]:
// ... the sign of the result of conversions, the quantize operation, the
// roundToIntegral operations, and the roundToIntegralExact(see 5.3.1) is
// the sign of the first or only operand.
return opOK;
}
// If the exponent is large enough, we know that this value is already
// integral, and the arithmetic below would potentially cause it to saturate
// to +/-Inf. Bail out early instead.
if (isFiniteNonZero() && exponent+1 >= (int)semanticsPrecision(*semantics))
if (exponent+1 >= (int)semanticsPrecision(*semantics))
return opOK;
// The algorithm here is quite simple: we add 2^(p-1), where p is the
@ -1998,19 +2043,18 @@ IEEEFloat::opStatus IEEEFloat::roundToIntegral(roundingMode rounding_mode) {
IEEEFloat MagicConstant(*semantics);
fs = MagicConstant.convertFromAPInt(IntegerConstant, false,
rmNearestTiesToEven);
assert(fs == opOK);
MagicConstant.sign = sign;
if (fs != opOK)
return fs;
// Preserve the input sign so that we can handle 0.0/-0.0 cases correctly.
// Preserve the input sign so that we can handle the case of zero result
// correctly.
bool inputSign = isNegative();
fs = add(MagicConstant, rounding_mode);
if (fs != opOK && fs != opInexact)
return fs;
fs = subtract(MagicConstant, rounding_mode);
// Current value and 'MagicConstant' are both integers, so the result of the
// subtraction is always exact according to Sterbenz' lemma.
subtract(MagicConstant, rounding_mode);
// Restore the input sign.
if (inputSign != isNegative())

View File

@ -1525,6 +1525,124 @@ TEST(APFloatTest, roundToIntegral) {
P = APFloat::getInf(APFloat::IEEEdouble(), true);
P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_TRUE(std::isinf(P.convertToDouble()) && P.convertToDouble() < 0.0);
APFloat::opStatus St;
P = APFloat::getNaN(APFloat::IEEEdouble());
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_TRUE(P.isNaN());
EXPECT_FALSE(P.isNegative());
EXPECT_EQ(APFloat::opOK, St);
P = APFloat::getNaN(APFloat::IEEEdouble(), true);
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_TRUE(P.isNaN());
EXPECT_TRUE(P.isNegative());
EXPECT_EQ(APFloat::opOK, St);
P = APFloat::getSNaN(APFloat::IEEEdouble());
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_TRUE(P.isNaN());
EXPECT_FALSE(P.isSignaling());
EXPECT_FALSE(P.isNegative());
EXPECT_EQ(APFloat::opInvalidOp, St);
P = APFloat::getSNaN(APFloat::IEEEdouble(), true);
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_TRUE(P.isNaN());
EXPECT_FALSE(P.isSignaling());
EXPECT_TRUE(P.isNegative());
EXPECT_EQ(APFloat::opInvalidOp, St);
P = APFloat::getInf(APFloat::IEEEdouble());
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_TRUE(P.isInfinity());
EXPECT_FALSE(P.isNegative());
EXPECT_EQ(APFloat::opOK, St);
P = APFloat::getInf(APFloat::IEEEdouble(), true);
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_TRUE(P.isInfinity());
EXPECT_TRUE(P.isNegative());
EXPECT_EQ(APFloat::opOK, St);
P = APFloat::getZero(APFloat::IEEEdouble(), false);
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_TRUE(P.isZero());
EXPECT_FALSE(P.isNegative());
EXPECT_EQ(APFloat::opOK, St);
P = APFloat::getZero(APFloat::IEEEdouble(), false);
St = P.roundToIntegral(APFloat::rmTowardNegative);
EXPECT_TRUE(P.isZero());
EXPECT_FALSE(P.isNegative());
EXPECT_EQ(APFloat::opOK, St);
P = APFloat::getZero(APFloat::IEEEdouble(), true);
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_TRUE(P.isZero());
EXPECT_TRUE(P.isNegative());
EXPECT_EQ(APFloat::opOK, St);
P = APFloat::getZero(APFloat::IEEEdouble(), true);
St = P.roundToIntegral(APFloat::rmTowardNegative);
EXPECT_TRUE(P.isZero());
EXPECT_TRUE(P.isNegative());
EXPECT_EQ(APFloat::opOK, St);
P = APFloat(1E-100);
St = P.roundToIntegral(APFloat::rmTowardNegative);
EXPECT_TRUE(P.isZero());
EXPECT_FALSE(P.isNegative());
EXPECT_EQ(APFloat::opInexact, St);
P = APFloat(1E-100);
St = P.roundToIntegral(APFloat::rmTowardPositive);
EXPECT_EQ(1.0, P.convertToDouble());
EXPECT_FALSE(P.isNegative());
EXPECT_EQ(APFloat::opInexact, St);
P = APFloat(-1E-100);
St = P.roundToIntegral(APFloat::rmTowardNegative);
EXPECT_TRUE(P.isNegative());
EXPECT_EQ(-1.0, P.convertToDouble());
EXPECT_EQ(APFloat::opInexact, St);
P = APFloat(-1E-100);
St = P.roundToIntegral(APFloat::rmTowardPositive);
EXPECT_TRUE(P.isZero());
EXPECT_TRUE(P.isNegative());
EXPECT_EQ(APFloat::opInexact, St);
P = APFloat(10.0);
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_EQ(10.0, P.convertToDouble());
EXPECT_EQ(APFloat::opOK, St);
P = APFloat(10.5);
St = P.roundToIntegral(APFloat::rmTowardZero);
EXPECT_EQ(10.0, P.convertToDouble());
EXPECT_EQ(APFloat::opInexact, St);
P = APFloat(10.5);
St = P.roundToIntegral(APFloat::rmTowardPositive);
EXPECT_EQ(11.0, P.convertToDouble());
EXPECT_EQ(APFloat::opInexact, St);
P = APFloat(10.5);
St = P.roundToIntegral(APFloat::rmTowardNegative);
EXPECT_EQ(10.0, P.convertToDouble());
EXPECT_EQ(APFloat::opInexact, St);
P = APFloat(10.5);
St = P.roundToIntegral(APFloat::rmNearestTiesToAway);
EXPECT_EQ(11.0, P.convertToDouble());
EXPECT_EQ(APFloat::opInexact, St);
P = APFloat(10.5);
St = P.roundToIntegral(APFloat::rmNearestTiesToEven);
EXPECT_EQ(10.0, P.convertToDouble());
EXPECT_EQ(APFloat::opInexact, St);
}
TEST(APFloatTest, isInteger) {