blob: 3e87eb090f6218403b229f88492b42334c464984 [file] [log] [blame]
.file "fmodl.s"
// Copyright (c) 2000 - 2004, Intel Corporation
// All rights reserved.
//
// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
// 02/02/00 Initial version
// 03/02/00 New Algorithm
// 04/04/00 Unwind support added
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [ the previously overwritten ] GR_Parameter_RESULT.
// 11/28/00 Set FR_Y to f9
// 03/11/02 Fixed flags for fmodl(qnan, zero)
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header:.section,.global,.proc,.align
// 04/28/03 Fix: fmod(sNaN, 0) no longer sets errno
// 11/23/04 Reformatted routine and improved speed
//
// API
//====================================================================
// long double fmodl(long double, long double);
//
// Overview of operation
//====================================================================
// fmod(a, b)= a-i*b,
// where i is an integer such that, if b!= 0,
// |i|<|a/b| and |a/b-i|<1
//
// Algorithm
//====================================================================
// a). if |a|<|b|, return a
// b). get quotient and reciprocal overestimates accurate to
// 33 bits (q2, y2)
// c). if the exponent difference (exponent(a)-exponent(b))
// is less than 32, truncate quotient to integer and
// finish in one iteration
// d). if exponent(a)-exponent(b)>= 32 (q2>= 2^32)
// round quotient estimate to single precision (k= RN(q2)),
// calculate partial remainder (a'= a-k*b),
// get quotient estimate (a'*y2), and repeat from c).
//
// Registers used
//====================================================================
GR_SMALLBIASEXP = r2
GR_2P32 = r3
GR_SMALLBIASEXP = r20
GR_ROUNDCONST = r21
GR_SIG_B = r22
GR_ARPFS = r23
GR_TMP1 = r24
GR_TMP2 = r25
GR_TMP3 = r26
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
GR_Parameter_Y = r38
GR_Parameter_RESULT = r39
GR_Parameter_TAG = r40
FR_X = f10
FR_Y = f9
FR_RESULT = f8
FR_ABS_A = f6
FR_ABS_B = f7
FR_Y_INV = f10
FR_SMALLBIAS = f11
FR_E0 = f12
FR_Q = f13
FR_E1 = f14
FR_2P32 = f15
FR_TMPX = f32
FR_TMPY = f33
FR_ROUNDCONST = f34
FR_QINT = f35
FR_QRND24 = f36
FR_NORM_B = f37
FR_TMP = f38
FR_TMP2 = f39
FR_DFLAG = f40
FR_Y_INV0 = f41
FR_Y_INV1 = f42
FR_Q0 = f43
FR_Q1 = f44
FR_QINT_Z = f45
FR_QREM = f46
FR_B_SGN_A = f47
.section .text
GLOBAL_IEEE754_ENTRY(fmodl)
// inputs in f8, f9
// result in f8
{ .mfi
getf.sig GR_SIG_B = f9
// FR_ABS_A = |a|
fmerge.s FR_ABS_A = f0, f8
mov GR_SMALLBIASEXP = 0x0ffdd
}
{ .mfi
nop.m 0
// FR_ABS_B = |b|
fmerge.s FR_ABS_B = f0, f9
nop.i 0
}
;;
{ .mfi
setf.exp FR_SMALLBIAS = GR_SMALLBIASEXP
// (1) y0
frcpa.s1 FR_Y_INV0, p6 = FR_ABS_A, FR_ABS_B
nop.i 0
}
;;
{ .mlx
nop.m 0
movl GR_ROUNDCONST = 0x33a00000
}
;;
// eliminate special cases
{ .mmi
nop.m 0
nop.m 0
// y pseudo-zero ?
cmp.eq p7, p10 = GR_SIG_B, r0
}
;;
// set p7 if b +/-NAN, +/-inf, +/-0
{ .mfi
nop.m 0
(p10) fclass.m p7, p10 = f9, 0xe7
nop.i 0
}
;;
{ .mfi
mov GR_2P32 = 0x1001f
// (2) q0 = a*y0
(p6) fma.s1 FR_Q0 = FR_ABS_A, FR_Y_INV0, f0
nop.i 0
}
{ .mfi
nop.m 0
// (3) e0 = 1 - b * y0
(p6) fnma.s1 FR_E0 = FR_ABS_B, FR_Y_INV0, f1
nop.i 0
}
;;
// set p9 if a +/-NAN, +/-inf
{ .mfi
nop.m 0
fclass.m.unc p9, p11 = f8, 0xe3
nop.i 0
}
// |a| < |b|? Return a, p8=1
{ .mfi
nop.m 0
(p10) fcmp.lt.unc.s1 p8, p0 = FR_ABS_A, FR_ABS_B
nop.i 0
}
;;
// set p7 if b +/-NAN, +/-inf, +/-0
{ .mfi
nop.m 0
// pseudo-NaN ?
(p10) fclass.nm p7, p0 = f9, 0xff
nop.i 0
}
;;
// set p9 if a is +/-NaN, +/-Inf
{ .mfi
nop.m 0
(p11) fclass.nm p9, p0 = f8, 0xff
nop.i 0
}
{ .mfi
nop.m 0
// b denormal ? set D flag (if |a|<|b|)
(p8) fnma.s0 FR_DFLAG = f9, f1, f9
nop.i 0
}
;;
{ .mfi
// FR_2P32 = 2^32
setf.exp FR_2P32 = GR_2P32
// (4) q1 = q0+e0*q0
(p6) fma.s1 FR_Q1 = FR_E0, FR_Q0, FR_Q0
nop.i 0
}
{ .mfi
nop.m 0
// (5) e1 = e0 * e0 + 2^-34
(p6) fma.s1 FR_E1 = FR_E0, FR_E0, FR_SMALLBIAS
nop.i 0
}
;;
{ .mfi
nop.m 0
// normalize a (if |a|<|b|)
(p8) fma.s0 f8 = f8, f1, f0
nop.i 0
}
{ .bbb
(p9) br.cond.spnt FMOD_A_NAN_INF
(p7) br.cond.spnt FMOD_B_NAN_INF_ZERO
// if |a|<|b|, return
(p8) br.ret.spnt b0
}
;;
{ .mfi
nop.m 0
// (6) y1 = y0 + e0 * y0
(p6) fma.s1 FR_Y_INV1 = FR_E0, FR_Y_INV0, FR_Y_INV0
nop.i 0
}
;;
{ .mfi
nop.m 0
// a denormal ? set D flag
// b denormal ? set D flag
fcmp.eq.s0 p12,p0 = FR_ABS_A, FR_ABS_B
nop.i 0
}
{ .mfi
// set FR_ROUNDCONST = 1.25*2^{-24}
setf.s FR_ROUNDCONST = GR_ROUNDCONST
// (7) q2 = q1+e1*q1
(p6) fma.s1 FR_Q = FR_Q1, FR_E1, FR_Q1
nop.i 0
}
;;
{ .mfi
nop.m 0
fmerge.s FR_B_SGN_A = f8, f9
nop.i 0
}
{ .mfi
nop.m 0
// (8) y2 = y1 + e1 * y1
(p6) fma.s1 FR_Y_INV = FR_E1, FR_Y_INV1, FR_Y_INV1
// set p6 = 0, p10 = 0
cmp.ne.and p6, p10 = r0, r0
}
;;
// will compute integer quotient bits (24 bits per iteration)
.align 32
loop64:
{ .mfi
nop.m 0
// compare q2, 2^32
fcmp.lt.unc.s1 p8, p7 = FR_Q, FR_2P32
nop.i 0
}
{ .mfi
nop.m 0
// will truncate quotient to integer, if exponent<32 (in advance)
fcvt.fx.trunc.s1 FR_QINT = FR_Q
nop.i 0
}
;;
{ .mfi
nop.m 0
// if exponent>32 round quotient to single precision (perform in advance)
fma.s.s1 FR_QRND24 = FR_Q, f1, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
// set FR_ROUNDCONST = sgn(a)
(p8) fmerge.s FR_ROUNDCONST = f8, f1
nop.i 0
}
{ .mfi
nop.m 0
// normalize truncated quotient
(p8) fcvt.xf FR_QRND24 = FR_QINT
nop.i 0
}
;;
{ .mfi
nop.m 0
// calculate remainder (assuming FR_QRND24 = RZ(Q))
(p7) fnma.s1 FR_E1 = FR_QRND24, FR_ABS_B, FR_ABS_A
nop.i 0
}
{ .mfi
nop.m 0
// also if exponent>32, round quotient to single precision
// and subtract 1 ulp: q = q-q*(1.25*2^{-24})
(p7) fnma.s.s1 FR_QINT_Z = FR_QRND24, FR_ROUNDCONST, FR_QRND24
nop.i 0
}
;;
{ .mfi
nop.m 0
// (p8) calculate remainder (82-bit format)
(p8) fnma.s1 FR_QREM = FR_QRND24, FR_ABS_B, FR_ABS_A
nop.i 0
}
{ .mfi
nop.m 0
// (p7) calculate remainder (assuming FR_QINT_Z = RZ(Q))
(p7) fnma.s1 FR_ABS_A = FR_QINT_Z, FR_ABS_B, FR_ABS_A
nop.i 0
}
;;
{ .mfi
nop.m 0
// Final iteration (p8): is FR_ABS_A the correct remainder
// (quotient was not overestimated) ?
(p8) fcmp.lt.unc.s1 p6, p10 = FR_QREM, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
// get new quotient estimation: a'*y2
(p7) fma.s1 FR_Q = FR_E1, FR_Y_INV, f0
nop.i 0
}
{ .mfb
nop.m 0
// was FR_Q = RZ(Q) ? (then new remainder FR_E1> = 0)
(p7) fcmp.lt.unc.s1 p7, p9 = FR_E1, f0
nop.b 0
}
;;
.pred.rel "mutex", p6, p10
{ .mfb
nop.m 0
// add b to estimated remainder (to cover the case when the quotient was
// overestimated)
// also set correct sign by using
// FR_B_SGN_A = |b|*sgn(a), FR_ROUNDCONST = sgn(a)
(p6) fma.s0 f8 = FR_QREM, FR_ROUNDCONST, FR_B_SGN_A
nop.b 0
}
{ .mfb
nop.m 0
// set correct sign of result before returning: FR_ROUNDCONST = sgn(a)
(p10) fma.s0 f8 = FR_QREM, FR_ROUNDCONST, f0
(p8) br.ret.sptk b0
}
;;
{ .mfi
nop.m 0
// if f13! = RZ(Q), get alternative quotient estimation: a''*y2
(p7) fma.s1 FR_Q = FR_ABS_A, FR_Y_INV, f0
nop.i 0
}
{ .mfb
nop.m 0
// if FR_E1 was RZ(Q), set remainder to FR_E1
(p9) fma.s1 FR_ABS_A = FR_E1, f1, f0
br.cond.sptk loop64
}
;;
FMOD_A_NAN_INF:
// b zero ?
{ .mfi
nop.m 0
fclass.m p10, p0 = f8, 0xc3 // Test a = nan
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 FR_NORM_B = f9, f1, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s0 f8 = f8, f1, f0
nop.i 0
}
{ .mfi
nop.m 0
(p10) fclass.m p10, p0 = f9, 0x07 // Test x = nan, and y = zero
nop.i 0
}
;;
{ .mfb
nop.m 0
fcmp.eq.unc.s1 p11, p0 = FR_NORM_B, f0
(p10) br.ret.spnt b0 // Exit with result = a if a = nan and b = zero
}
;;
{ .mib
nop.m 0
nop.i 0
// if Y zero
(p11) br.cond.spnt FMOD_B_ZERO
}
;;
// a= infinity? Return QNAN indefinite
{ .mfi
// set p7 t0 0
cmp.ne p7, p0 = r0, r0
fclass.m.unc p8, p9 = f8, 0x23
nop.i 0
}
;;
// b NaN ?
{ .mfi
nop.m 0
(p8) fclass.m p9, p8 = f9, 0xc3
nop.i 0
}
;;
// b not pseudo-zero ? (GR_SIG_B holds significand)
{ .mii
nop.m 0
(p8) cmp.ne p7, p0 = GR_SIG_B, r0
nop.i 0
}
;;
{ .mfi
nop.m 0
(p8) frcpa.s0 f8, p0 = f8, f8
nop.i 0
}
{ .mfi
nop.m 0
// also set Denormal flag if necessary
(p7) fnma.s0 f9 = f9, f1, f9
nop.i 0
}
;;
{ .mfb
nop.m 0
(p8) fma.s0 f8 = f8, f1, f0
nop.b 0
}
;;
{ .mfb
nop.m 0
(p9) frcpa.s0 f8, p7 = f8, f9
br.ret.sptk b0
}
;;
FMOD_B_NAN_INF_ZERO:
// b INF
{ .mfi
nop.m 0
fclass.m.unc p7, p0 = f9, 0x23
nop.i 0
}
;;
{ .mfb
nop.m 0
(p7) fma.s0 f8 = f8, f1, f0
(p7) br.ret.spnt b0
}
;;
// b NAN?
{ .mfi
nop.m 0
fclass.m.unc p9, p10 = f9, 0xc3
nop.i 0
}
;;
{ .mfi
nop.m 0
(p10) fclass.nm p9, p0 = f9, 0xff
nop.i 0
}
;;
{ .mfb
nop.m 0
(p9) fma.s0 f8 = f9, f1, f0
(p9) br.ret.spnt b0
}
;;
FMOD_B_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
{ .mfi
nop.m 0
// set Invalid
frcpa.s0 FR_TMP, p0 = f0, f0
nop.i 0
}
;;
// a NAN?
{ .mfi
nop.m 0
fclass.m.unc p9, p10 = f8, 0xc3
nop.i 0
}
;;
{ .mfi
alloc GR_ARPFS = ar.pfs, 1, 4, 4, 0
(p10) fclass.nm p9, p10 = f8, 0xff
nop.i 0
}
;;
{ .mfi
nop.m 0
(p9) frcpa.s0 FR_TMP2, p7 = f8, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
(p10) frcpa.s0 FR_TMP2, p7 = f9, f9
mov GR_Parameter_TAG = 120
}
;;
{ .mfi
nop.m 0
fmerge.s FR_X = f8, f8
nop.i 0
}
{ .mfb
nop.m 0
fma.s0 f8 = FR_TMP2, f1, f0
br.sptk __libm_error_region
}
;;
GLOBAL_IEEE754_END(fmodl)
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y = -32, sp // Parameter 2 value
nop.f 0
.save ar.pfs, GR_SAVE_PFS
mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
add sp = -64, sp // Create new stack
nop.f 0
mov GR_SAVE_GP = gp // Save gp
}
;;
{ .mmi
stfe [ GR_Parameter_Y ] = FR_Y, 16 // Save Parameter 2 on stack
add GR_Parameter_X = 16, sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0 = b0 // Save b0
}
;;
.body
{ .mib
stfe [ GR_Parameter_X ] = FR_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0, GR_Parameter_Y
nop.b 0 // Parameter 3 address
}
{ .mib
stfe [ GR_Parameter_Y ] = FR_RESULT // Store Parameter 3 on stack
add GR_Parameter_Y = -16, GR_Parameter_Y
br.call.sptk b0 = __libm_error_support# // Call error handling function
}
;;
{ .mmi
nop.m 0
nop.m 0
add GR_Parameter_RESULT = 48, sp
}
;;
{ .mmi
ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack
.restore sp
add sp = 64, sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
}
;;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
}
;;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#, @function
.global __libm_error_support#