// sysdeps/ia64/fpu/s_tanhf.S - nest-cam/v366/glibc - Git at Google

 .file "tanhf.s"


 // Copyright (c) 2001 - 2005, Intel Corporation
 // All rights reserved.
 //
 // Contributed 2001 by the Intel Numerics Group, Intel Corporation
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 // * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //
 // * Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
 //
 // * The name of Intel Corporation may not be used to endorse or promote
 // products derived from this software without specific prior written
 // permission.

 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Intel Corporation is the author of this code, and requests that all
 // problem reports or change requests be submitted to it directly at
 // http://www.intel.com/software/products/opensource/libraries/num.htm.
 //
 // History
 //==============================================================
 // 05/30/01 Initial version
 // 05/20/02 Cleaned up namespace and sf0 syntax
 // 02/10/03 Reordered header: .section, .global, .proc, .align
 // 03/31/05 Reformatted delimiters between data tables
 //
 // API
 //==============================================================
 // float tanhf(float)
 //
 // Overview of operation
 //==============================================================
 // Background
 //
 //
 // There are 9 paths:
 // 1. x = +/-0.0
 //    Return tanhf(x) = +/-0.0
 //
 // 2. 0.0 < |x| < 0.3125
 //    Return tanhf(x) = x + x^3*Pol3(x^2),
 //    where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
 //
 // 3. 0.3125 <= |x| < 8.0
 //    Return tanhf(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
 //    where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
 //          PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
 //          PolA(|x|) = A3|x|^3 + A2*x^2 + A1*|x| + A0
 //
 //    Actually range 0.3125<=|x|< 8.0 is split to 5 subranges.
 //    For each subrange there is particular set of coefficients.
 //    Below is the list of subranges:
 //    3.1 0.3125 <= |x| < 0.5
 //    3.2 0.5 <= |x| < 1.0
 //    3.3 1.0 <= |x| < 2.0
 //    3.4 2.0 <= |x| < 4.0
 //    3.5 4.0 <= |x| < 8.0
 //
 // 4. 8.0 <= |x| < 9.125
 //    Return tanhf(x) = sign(x)*(A3|x|^3 + A2*x^2 + A1*|x| + A0)
 //
 // 5. 9.125 <= |x| < +INF
 //    Return tanhf(x) = sign(x)*(1.0d - 2^(-53))
 //
 // 6. |x| = INF
 //    Return tanhf(x) = sign(x) * 1.0
 //
 // 7. x = [S,Q]NaN
 //    Return tanhf(x) = QNaN
 //
 // 8. x is positive denormal
 //    Return tanhf(x) = x - x^2
 //
 // 9. x is negative denormal
 //    Return tanhf(x) = x + x^2
 //
 // Registers used
 //==============================================================
 // Floating Point registers used:
 // f8, input
 // f32 -> f59

 // General registers used:
 // r32 -> r46, r2, r3

 // Predicate registers used:
 // p0, p6 -> p15

 // p6           to filter out case when x = [Q,S]NaN or +/-0
 // p7           to filter out case when x = denormal
 // p8           set if |x| >= 0.3125, used also to process denormal input
 // p9           to filter out case when |x| = inf
 // p10          to filter out case when |x| < 0.3125
 // p11          set if |x| < 9.125 (gates the coefficient loads)
 // p12          to filter out case when |x| >= 9.125
 // p13          to filter out case when 8.0 <= |x| < 9.125
 // p14          set to 1 for positive x
 // p15          set to 1 for negative x

 // Assembly macros
 //==============================================================
 // ---- General registers: addresses into tanhf_data ----
 rDataPtr           = r2     // base address of tanhf_data (later +544 on the saturation path)
 rCoeffAddr1        = r36    // rDataPtr + 16*rBias: C0,C1 pair, then +48 for the D2,B0 pair
 rCoeffAddr3        = r34    // rDataPtr + 16 + 16*rBias: C2,C3 pair, then D0,D1 pair
 rCoeffAddr2        = r37    // rDataPtr + 352 + 8*rBias: A0,A1 pair, then A2,A3 pair

 // ---- General registers: bit pattern of x and values derived from it ----
 rArg               = r41    // single-precision bits of x (getf.s)
 rAbsArg            = r44    // rArg with the sign bit cleared
 rOffset2           = r38    // (rArg & ~rMask) >> 21
 rBias              = r33    // coefficient-set selector: rOffset2 - rBias2, or 0x14 when |x| < 0.3125

 // ---- General registers: integer constants ----
 rMask              = r40    // 0x806 << 20: sign bit and 2 top significand bits
 rSignBit           = r43    // 1 << 31
 rBias2             = r39    // 0x1F4
 rBound             = r42    // 0x3EA << 20: bits of 0.3125f
 rSaturation        = r46    // 0x4112 << 16: bits of 9.125f
 rNearSaturation    = r35    // 0x14: rBias value tested for the 8.0 <= |x| path

 // ---- General registers: aliases the tanhf code does not reference ----
 rDataPtr1          = r3
 rDataPtr2          = r45

 //==============================================================
 // ---- Floating point: polynomial coefficients loaded from tanhf_data ----
 fA0                = f32
 fA1                = f33
 fA2                = f34
 fA3                = f35
 fC0                = f36
 fC1                = f37
 fC2                = f38
 fC3                = f39
 fD0                = f40
 fD1                = f41
 fD2                = f42
 fB0                = f43    // also overwritten with B0*x^4 on the main path

 // ---- Floating point: argument, its sign and its powers ----
 fAbsArg            = f45    // |x|
 fSignumX           = f46    // sign(x)*1.0
 fArgSqr            = f44    // x^2
 fArgSqrSgn         = f59    // sign(x)*x^2
 fArg3              = f49    // |x|^3
 fArg3Sgn           = f50    // sign(x)*|x|^3
 fArg4              = f47    // x^4
 fArg4Sgn           = f48    // sign(x)*x^4
 fArg6Sgn           = f52    // sign(x)*x^6
 fArg7Sgn           = f51    // sign(x)*|x|^7

 // ---- Floating point: partial polynomial sums ----
 fPolA              = f55
 fPolATmp           = f56
 fPolC              = f53
 fPolCTmp           = f54
 fPolD              = f57
 fPolDTmp           = f58

 // Data tables
 //==============================================================

 // tanhf_data: double-precision coefficient table read by tanhf.
 //
 // Layout (byte offsets from the start of the table):
 //     0..319  five 64-byte rows {C0,C1,C2,C3,D0,D1,D2,B0}, one per
 //             subrange of 0.3125 <= |x| < 8.0
 //   320..351  four coefficients for |x| < 0.3125
 //   352..543  six 32-byte rows {A0,A1,A2,A3}: five for the subranges of
 //             0.3125 <= |x| < 8.0, one for 8.0 <= |x| < 9.125
 //   544..551  saturation value returned for |x| >= 9.125
 //
 // tanhf addresses the table with hard-coded offsets (16, 48, 352, 544) and
 // row strides (64 and 32 bytes), so the order and count of the entries must
 // not change.  Coefficients are fetched in pairs with ldfpd at offsets that
 // are multiples of 16 from the table start.
 RODATA

 .align 16

 LOCAL_OBJECT_START(tanhf_data)
 // Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5
 // (offset 0)
 data8 0x3F9BEEDFDD177D7B // C0
 data8 0x3F970D10C7F32458 // C1
 data8 0x3F766D6B051F3A38 // C2
 data8 0xBF732F2001B23402 // C3
 data8 0xBF854BE1CE1ED499 // D0
 data8 0x4013C944F3999A16 // D1
 data8 0xC01106C6975222C0 // D2
 data8 0x3F783D5ACCF9EBE8 // B0
 // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
 // (offset 64)
 data8 0xBF5D631440786869 // C0
 data8 0xBF575D79A0D52069 // C1
 data8 0xBF7E2237B7EFC705 // C2
 data8 0x3F6A7ACBC273041F // C3
 data8 0xC040E32EA52D91EB // D0
 data8 0x403D19463E5DB4D7 // D1
 data8 0xC02216F61F759F39 // D2
 data8 0xBF55B4EA0B844BE7 // B0
 // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
 // (offset 128)
 data8 0x3F8637DBE5B3E690 // C0
 data8 0xBF7F7FEC158C07F5 // C1
 data8 0x3F711C586706838A // C2
 data8 0xBF50EF7EF605554E // C3
 data8 0xC054D45448354E25 // D0
 data8 0x404ADFEEA282E730 // D1
 data8 0xC028AEE456D59549 // D2
 data8 0x3F25232D1BED59A8 // B0
 // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0
 // (offset 192)
 data8 0xBF52602285F2D06C // C0
 data8 0x3F2E57C298FFE1E0 // C1
 data8 0xBF15ED575DB3C811 // C2
 data8 0x3EE428878A08525C // C3
 data8 0xC0895A26849039C1 // D0
 data8 0x406E3C60BBFBB575 // D1
 data8 0xC03A06F62867C75A // D2
 data8 0xBEB114C70F1C723E // B0
 // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0
 // (offset 256)
 data8 0x3EF4B22BD17039A3 // C0
 data8 0xBEB704ADC040C57F // C1
 data8 0x3E937A98288AFE1A // C2
 data8 0xBE4F33B2C9FFE7E7 // C3
 data8 0xC0BE48CFADE2431E // D0
 data8 0x4090E74249760FDD // D1
 data8 0xC04B6F537FCF2F1E // D2
 data8 0x3E0DCD879C91ADEA // B0
 // Polynomial coefficients for the tanh(x), -0.3125 < x < 0.3125
 // (offset 320; tanhf loads these four values into fC0..fC3 and uses them
 //  as C0..C3 of x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0))
 data8 0xBFD555551E8245B7 // A0
 data8 0x3FC110E63F52E689 // A1
 data8 0xBFAB8CD6A5B7BAFA // A2
 data8 0x3F945D467FCEB553 // A3
 // Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5
 // (offset 352)
 data8 0xBE3DCC92FCAECBB6 // A0
 data8 0x3FF0000043B7D267 // A1
 data8 0xBED18BF28ACFC4B1 // A2
 data8 0xBFD554A56F82837E // A3
 // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
 // (offset 384)
 data8 0x3EFD6054758539F9 // A0
 data8 0x3FEFFBFC77198EBE // A1
 data8 0x3F700327CA98D237 // A2
 data8 0xBFD68955F5BB2FA1 // A3
 // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
 // (offset 416)
 data8 0xBF71A53F229DF01B // A0
 data8 0x3FF0AECFD730DE50 // A1
 data8 0xBFC882F88E5DF3BA // A2
 data8 0x3FC6EDF212CA2A8D // A3
 // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0
 // (offset 448)
 data8 0xBFAF0B712E9EDA47 // A0
 data8 0x3FF1C208080BEA64 // A1
 data8 0x3FC3D29B20C8946E // A2
 data8 0xBFF04514ED900A6A // A3
 // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0
 // (offset 480)
 data8 0xBFB1DEA49A831CBC // A0
 data8 0x3FFA729FC7085674 // A1
 data8 0xBFF2F44D923A8FA4 // A2
 data8 0x3FE092FC5712227E // A3
 // Polynomial coefficients for the tanh(x), 8.0 <= |x| < 9.125
 // (offset 512)
 data8 0x3FEFFF5769EE3041 // A0
 data8 0x3EFBBF148D850891 // A1
 data8 0xBEC86BCEF0F5C2FE // A2
 data8 0x3E7CBA4F3A885A5C // A3
 //
 // (offset 544) largest double below 1.0, i.e. 1.0 - 2^(-53)
 data8 0x3FEFFFFFFFFFFFFF // 1.0 - epsilon
 LOCAL_OBJECT_END(tanhf_data)

 //==============================================================
 // float tanhf(float x)
 //
 // In:   f8 = x
 // Out:  f8 = tanhf(x).  Every exit path writes f8 with a
 //       single-precision operation (fma.s / fms.s / fnma.s), except the
 //       +/-inf exit, which builds sign(x)*1.0 with fmerge.s.
 // Uses: r2, the r32-r46 frame created by alloc, the f32-f59 aliases and
 //       predicates p6-p15.  The routine makes no calls.
 //
 // The path is selected from the raw single-precision bits of x:
 //   fclass.m denormal               -> tanhf_denormal
 //   fclass.m NaN or zero            -> return x*1 + x
 //   fclass.m infinity               -> return sign(x)*1.0
 //   rAbsArg <  0x3EA00000 (0.3125f) -> tanhf_near_zero
 //   rAbsArg >= 0x41120000 (9.125f)  -> tanhf_saturation
 //   rBias == 0x14 (8.0 <= |x|)      -> tanhf_close_to_saturation
 //   otherwise                       -> main path, 0.3125 <= |x| < 8.0
 //
 // For |x| >= 0.3125, rBias = 4*(biased exponent of x - 125), i.e.
 // 0,4,8,12,16 for the five main subranges and 20 for 8.0 <= |x| < 16.
 // rBias*16 selects a 64-byte C/D/B row of tanhf_data and rBias*8 (added
 // to offset 352) selects a 32-byte A row.
 //==============================================================
 .section .text
 GLOBAL_LIBM_ENTRY(tanhf)

 // Create the register frame, form |x| and x^2, and start building the
 // integer constants (completed by the shl instructions further down).
 { .mfi
       alloc          r32 = ar.pfs, 1, 14, 0, 0
       fmerge.s       fAbsArg = f1, f8             // |x|: sign of 1.0, magnitude of x
       addl           rMask = 0x806, r0
 }
 { .mfi
       addl           rDataPtr = @ltoff(tanhf_data), gp
       fma.s1         fArgSqr = f8, f8, f0         // x^2
       adds           rSignBit = 0x1, r0
 }
 ;;

 { .mfi
       getf.s         rArg = f8                    // x in GR
       fclass.m       p7,p0 = f8, 0x0b             // is x denormal ?
       // sign bit and 2 most bits in significand
       // (0x806 << 20 = 0x80600000: bits 31, 22 and 21)
       shl            rMask = rMask, 20
 }
 { .mfi
       // replace the @ltoff slot address with the table address stored there
       ld8            rDataPtr = [rDataPtr]
       nop.f          0
       // 0x1F4 = 500 = 4*125: value of rOffset2 for 0.3125 <= |x| < 0.5
       adds           rBias2 = 0x1F4, r0
 }
 ;;

 { .mfi
       // rBias value that identifies the 8.0 <= |x| subrange
       adds           rNearSaturation = 0x14, r0
       fmerge.s       fSignumX = f8, f1            // signum(x)
       shl            rSignBit = rSignBit, 31      // mask for sign bit
 }
 { .mfi
       adds           rBound = 0x3EA, r0
       nop.f          0
       addl           rSaturation = 0x4112, r0
 }
 ;;

 { .mfi
       // clear the sign bit and the 2 top significand bits of x
       andcm          rOffset2 = rArg, rMask
       fclass.m       p6,p0 = f8, 0xc7             // is x [S,Q]NaN or +/-0 ?
       shl            rBound = rBound, 20          // 0.3125f in GR (0x3EA00000)
 }
 { .mfb
       andcm          rAbsArg = rArg, rSignBit     // |x| in GR
       nop.f          0
 (p7)  br.cond.spnt   tanhf_denormal               // branch out if x is denormal
 }
 ;;

 { .mfi
       // the {A0..A3} rows for |x| >= 0.3125 start at byte offset 352
       adds           rCoeffAddr2 = 352, rDataPtr
       fclass.m       p9,p0 = f8, 0x23            // is x +/- inf?
       // rOffset2 = 4 * (biased exponent of x)
       shr            rOffset2 = rOffset2, 21
 }
 { .mfi
       // p10 = |x| < 0.3125, p8 = |x| >= 0.3125 (integer compare of the bits)
       cmp.lt         p10, p8 = rAbsArg, rBound   // |x| < 0.3125?
       nop.f          0
       // second coefficient pair (C2,C3) of a row is 16 bytes into the row
       adds           rCoeffAddr3 = 16, rDataPtr
 }
 ;;

 // Choose the coefficient set.  p8 and p10 are complementary, so exactly one
 // of the two writes to rBias takes effect.
 { .mfi
 (p8)  sub            rBias = rOffset2, rBias2
       fma.s1         fArg4 = fArgSqr, fArgSqr, f0 // x^4
       shl            rSaturation = rSaturation, 16 // 9.125f in GR (0x41120000)
 }
 { .mfb
       // |x| < 0.3125: 0x14*16 = 320, the offset of the near-zero coefficients
 (p10) adds           rBias = 0x14, r0
 (p6)  fma.s.s0       f8 = f8,f1,f8                // NaN or +/-0: result is x*1 + x
 (p6)  br.ret.spnt    b0                           // exit for x = NaN or +/-0
 }
 ;;

 { .mfi
       // rCoeffAddr1 = rDataPtr + 16*rBias
       shladd         rCoeffAddr1 = rBias, 4, rDataPtr
       fma.s1         fArg3Sgn = fArgSqr, f8, f0  // sign(x)*|x|^3
       // is |x| < 9.125?
       // p11 = |x| < 9.125 (also true for |x| < 0.3125), p12 = |x| >= 9.125
       cmp.lt         p11, p12 = rAbsArg, rSaturation
 }
 { .mfi
       // rCoeffAddr3 = rDataPtr + 16 + 16*rBias
       shladd         rCoeffAddr3 = rBias, 4, rCoeffAddr3
       fma.s1         fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
       // rCoeffAddr2 = rDataPtr + 352 + 8*rBias
       shladd         rCoeffAddr2 = rBias, 3, rCoeffAddr2
 }
 ;;

 // Coefficient loads are all predicated on p11, so nothing is read from the
 // table through the rBias-derived addresses when |x| >= 9.125.
 { .mfi
 (p11) ldfpd          fC0, fC1 = [rCoeffAddr1]
 (p9)  fmerge.s       f8 = f8,f1                   // +/- inf: sign(x) * 1.0
       // |x| >= 9.125: point at the saturation constant at offset 544
 (p12) adds           rDataPtr = 544, rDataPtr
 }
 { .mfb
       // post-increment leaves rCoeffAddr3 at the D0,D1 pair
 (p11) ldfpd          fC2, fC3 = [rCoeffAddr3], 16
       nop.f          0
 (p9)  br.ret.spnt    b0                           // exit for x = +/- inf
 }
 ;;

 { .mfi
       // post-increment leaves rCoeffAddr2 at the A2,A3 pair
 (p11) ldfpd          fA0, fA1 = [rCoeffAddr2], 16
       nop.f          0
       // p13 = (|x| >= 0.3125) and rBias == 0x14; cleared when p8 is false.
       // It is only tested after the p12 branch, i.e. for |x| < 9.125.
 (p8)  cmp.eq.unc     p13, p0 = rBias, rNearSaturation
 }
 { .mfi
       // D2,B0 pair is 48 bytes into the row
       add            rCoeffAddr1 = 48, rCoeffAddr1
       nop.f          0
       nop.i          0
 }
 ;;

 { .mfi
 (p11) ldfpd          fD0, fD1 = [rCoeffAddr3]
       nop.f          0
       nop.i          0
 }
 { .mfb
 (p11) ldfpd          fD2, fB0 = [rCoeffAddr1]
       // sign(x)*|x|^2
       fma.s1         fArgSqrSgn = fArgSqr, fSignumX, f0
 (p10) br.cond.spnt   tanhf_near_zero
 }
 ;;

 { .mfi
 (p11) ldfpd          fA2, fA3 = [rCoeffAddr2], 16
       // p15 = x < 0, p14 = not (x < 0)
       fcmp.lt.s1     p15, p14 = f8,f0
       nop.i          0
 }
 { .mfb
       // |x| >= 9.125: fA0 = the constant at offset 544 (0x3FEFFFFFFFFFFFFF)
 (p12) ldfd           fA0 = [rDataPtr]
       fma.s1         fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
 (p12) br.cond.spnt   tanhf_saturation
 }
 ;;
 { .mfi
       nop.m          0
       fma.s1         fArg7Sgn = fArg4, fArg3Sgn, f0  // sign(x)*|x|^7
       nop.i          0
 }
 { .mfb
       nop.m          0
       fma.s1         fArg6Sgn = fArg3, fArg3Sgn, f0  // sign(x)*|x|^6
 (p13) br.cond.spnt   tanhf_close_to_saturation
 }
 ;;

 // Main path, 0.3125 <= |x| < 8.0:
 //   result = PolC(|x|) * (sign(x)*PolD) + sign(x)*PolA(|x|)
 { .mfi
       nop.m          0
       fma.s1         fPolC = fC3, fAbsArg, fC2    // C3*|x| + C2
       nop.i          0
 }
 { .mfi
       nop.m          0
       fma.s1         fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
       nop.i          0
 };;

 { .mfi
       nop.m          0
       fma.s1         fPolA = fA1, fAbsArg, fA0    // A1*|x| + A0
       nop.i          0
 }
 ;;

 { .mfi
       nop.m          0
       fma.s1         fPolD = fD1, fAbsArg, fD0    // D1*|x| + D0
       nop.i          0
 }
 { .mfi
       nop.m          0
       // sign(x)*(|x|^7 + D2*x^6)
       fma.s1         fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
       nop.i          0
 };;

 { .mfi
       nop.m          0
       fma.s1         fPolATmp = fA3, fAbsArg, fA2  // A3*|x| + A2
       nop.i          0
 }
 { .mfi
       nop.m          0
       // the coefficient register is reused to hold the product
       fma.s1         fB0 = fB0, fArg4, f0          // B0*x^4
       nop.i          0
 };;

 { .mfi
       nop.m          0
       // C3*|x|^3 + C2*x^2 + C1*|x| + C0
       fma.s1         fPolC = fPolC, fArgSqr, fPolCTmp
       nop.i          0
 }
 ;;

 { .mfi
       nop.m          0
       // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
       fma.d.s1       fPolD = fPolD, fArg4Sgn, fPolDTmp
       nop.i          0
 }
 ;;

 { .mfi
       nop.m          0
       // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
       fma.d.s1       fPolA = fPolATmp, fArgSqr, fPolA
       nop.i          0
 }
 ;;

 { .mfi
       nop.m          0
       // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
       fma.d.s1       fPolC = fPolC, f1, fB0
       nop.i          0
 }
 ;;

 // fPolD already carries sign(x); fPolA does not, so it is added for
 // x >= 0 (p14) and subtracted for x < 0 (p15).
 { .mfi
       nop.m          0
 (p14) fma.s.s0       f8 = fPolC, fPolD, fPolA     // for positive x
       nop.i          0
 }
 { .mfb
       nop.m          0
 (p15) fms.s.s0       f8 = fPolC, fPolD, fPolA     // for negative x
       br.ret.sptk    b0                           // Exit for 0.3125 <=|x|< 8.0
 };;


 // Here if |x| < 0.3125
 // fC0..fC3 hold the four coefficients stored at table offset 320 (rBias
 // was forced to 0x14 on this path).
 tanhf_near_zero:
 { .mfi
       nop.m          0
       fma.s1         fPolC = fC3, fArgSqr, fC2    // C3*x^2 + C2
       nop.i          0
 }
 { .mfi
       nop.m          0
       fma.s1         fPolCTmp = fC1, fArgSqr, fC0  // C1*x^2 + C0
       nop.i          0
 };;

 { .mfi
       nop.m          0
       fma.s1         fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0
       nop.i          0
 };;

 { .mfb
       nop.m          0
       // x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0)
       fma.s.s0       f8 = fPolC, fArg3Sgn, f8
       br.ret.sptk    b0                           // Exit for |x| < 0.3125
 };;

 // Here if 9.125 <= |x| < +inf
 // fA0 holds the constant loaded from table offset 544.
 tanhf_saturation:
 { .mfb
       nop.m          0
       fma.s.s0       f8 = fA0, fSignumX, f0       // sign(x)*(1.0d - 2^(-53))
       // Exit for 9.125 <= |x| < +inf
       br.ret.sptk    b0                           // Exit for 9.125 <=|x|< +inf
 }
 ;;

 // Here if  8.0 <= |x| < 9.125
 // fA0..fA3 hold the row at table offset 512 (352 + 8*0x14).
 // Result: sign(x)*(A3*|x|^3 + A2*x^2 + A1*|x| + A0)
 tanhf_close_to_saturation:
 { .mfi
       nop.m          0
       fma.s1         fPolATmp = fA1, fAbsArg, fA0 // A1*|x| + A0
       nop.i          0
 }
 { .mfi
       nop.m          0
       fma.s1         fPolA = fA3, fAbsArg, fA2    // A3*|x| + A2
       nop.i          0
 }
 ;;

 // p14/p15 come from the fcmp.lt above; the annotation tells the assembler
 // that the two predicated writes to f8 below cannot both execute.
 .pred.rel "mutex", p14, p15
 { .mfi
       nop.m          0
       // for positive x
 (p14) fma.s.s0       f8 = fPolA, fArgSqr, fPolATmp
       nop.i          0
 }
 { .mfb
       nop.m          0
       // for negative x
       // (A3*|x| + A2)*(sign(x)*x^2) - (A1*|x| + A0)
 (p15) fms.s.s0       f8 = fPolA, fArgSqrSgn, fPolATmp
       br.ret.sptk    b0                           // Exit for 8.0 <=|x|< 9.125
 };;

 // Here if x is single precision denormal
 // p7 = x is a negative denormal, p8 = otherwise (a positive denormal here).
 tanhf_denormal:
 { .mfi
       nop.m          0
       fclass.m       p7,p8 = f8, 0x0a             // is x -denormal ?
       nop.i          0
 }
 ;;

 { .mfi
       nop.m          0
 (p7)  fma.s.s0       f8 = f8,f8,f8                // -denormal: x + x^2
       nop.i          0
 }
 { .mfb
       nop.m          0
 (p8)  fnma.s.s0      f8 = f8,f8,f8                // +denormal: x - x^2
       br.ret.sptk    b0                           // Exit for denormal
 }
 ;;

 GLOBAL_LIBM_END(tanhf)
	.file "tanhf.s"


	// Copyright (c) 2001 - 2005, Intel Corporation
	// All rights reserved.
	//
	// Contributed 2001 by the Intel Numerics Group, Intel Corporation
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are
	// met:
	//
	// * Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	//
	// * Redistributions in binary form must reproduce the above copyright
	// notice, this list of conditions and the following disclaimer in the
	// documentation and/or other materials provided with the distribution.
	//
	// * The name of Intel Corporation may not be used to endorse or promote
	// products derived from this software without specific prior written
	// permission.

	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
	// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
	// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	//
	// Intel Corporation is the author of this code, and requests that all
	// problem reports or change requests be submitted to it directly at
	// http://www.intel.com/software/products/opensource/libraries/num.htm.
	//
	// History
	//==============================================================
	// 05/30/01 Initial version
	// 05/20/02 Cleaned up namespace and sf0 syntax
	// 02/10/03 Reordered header: .section, .global, .proc, .align
	// 03/31/05 Reformatted delimiters between data tables
	//
	// API
	//==============================================================
	// float tanhf(float)
	//
	// Overview of operation
	//==============================================================
	// Background
	//
	//
	// There are 9 paths:
	// 1. x = +/-0.0
	// Return tanhf(x) = +/-0.0
	//
	// 2. 0.0 < |x| < 0.3125
	// Return tanhf(x) = x + x^3*Pol3(x^2),
	// where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
	//
	// 3. 0.3125 <= |x| < 8.0
	// Return tanhf(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
	// where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
	// PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
	// PolA(|x|) = A3*|x|^3 + A2*x^2 + A1*|x| + A0
	//
	// Actually range 0.3125 <= |x| < 8.0 is split to 5 subranges.
	// For each subrange there is particular set of coefficients.
	// Below is the list of subranges:
	// 3.1 0.3125 <= |x| < 0.5
	// 3.2 0.5 <= |x| < 1.0
	// 3.3 1.0 <= |x| < 2.0
	// 3.4 2.0 <= |x| < 4.0
	// 3.5 4.0 <= |x| < 8.0
	//
	// 4. 8.0 <= |x| < 9.125
	// Return tanhf(x) = sign(x)*(A3*|x|^3 + A2*x^2 + A1*|x| + A0)
	//
	// 5. 9.125 <= |x| < +INF
	// Return tanhf(x) = sign(x)*(1.0d - 2^(-53))
	//
	// 6. |x| = INF
	// Return tanhf(x) = sign(x) * 1.0
	//
	// 7. x = [S,Q]NaN
	// Return tanhf(x) = QNaN
	//
	// 8. x is positive denormal
	// Return tanhf(x) = x - x^2
	//
	// 9. x is negative denormal
	// Return tanhf(x) = x + x^2
	//
	// Registers used
	//==============================================================
	// Floating Point registers used:
	// f8, input
	// f32 -> f59

	// General registers used:
	// r32 -> r46, r2, r3

	// Predicate registers used:
	// p0, p6 -> p15

	// p6 to filter out case when x = [Q,S]NaN or +/-0
	// p7 to filter out case when x = denormal
	// p8 set if |x| >= 0.3125, used also to process denormal input
	// p9 to filter out case when |x| = inf
	// p10 to filter out case when |x| < 0.3125
	// p11 set if |x| < 9.125 (gates the coefficient loads)
	// p12 to filter out case when |x| >= 9.125
	// p13 to filter out case when 8.0 <= |x| < 9.125
	// p14 set to 1 for positive x
	// p15 set to 1 for negative x

	// Assembly macros
	//==============================================================
	// ---- General registers: addresses into tanhf_data ----
	rDataPtr        = r2     // base address of tanhf_data (later +544 on the saturation path)
	rCoeffAddr1     = r36    // rDataPtr + 16*rBias: C0,C1 pair, then +48 for the D2,B0 pair
	rCoeffAddr3     = r34    // rDataPtr + 16 + 16*rBias: C2,C3 pair, then D0,D1 pair
	rCoeffAddr2     = r37    // rDataPtr + 352 + 8*rBias: A0,A1 pair, then A2,A3 pair

	// ---- General registers: bit pattern of x and values derived from it ----
	rArg            = r41    // single-precision bits of x (getf.s)
	rAbsArg         = r44    // rArg with the sign bit cleared
	rOffset2        = r38    // (rArg & ~rMask) >> 21
	rBias           = r33    // coefficient-set selector: rOffset2 - rBias2, or 0x14 when |x| < 0.3125

	// ---- General registers: integer constants ----
	rMask           = r40    // 0x806 << 20: sign bit and 2 top significand bits
	rSignBit        = r43    // 1 << 31
	rBias2          = r39    // 0x1F4
	rBound          = r42    // 0x3EA << 20: bits of 0.3125f
	rSaturation     = r46    // 0x4112 << 16: bits of 9.125f
	rNearSaturation = r35    // 0x14: rBias value tested for the 8.0 <= |x| path

	// ---- General registers: aliases the tanhf code does not reference ----
	rDataPtr1       = r3
	rDataPtr2       = r45

	//==============================================================
	// ---- Floating point: polynomial coefficients loaded from tanhf_data ----
	fA0             = f32
	fA1             = f33
	fA2             = f34
	fA3             = f35
	fC0             = f36
	fC1             = f37
	fC2             = f38
	fC3             = f39
	fD0             = f40
	fD1             = f41
	fD2             = f42
	fB0             = f43    // also overwritten with B0*x^4 on the main path

	// ---- Floating point: argument, its sign and its powers ----
	fAbsArg         = f45    // |x|
	fSignumX        = f46    // sign(x)*1.0
	fArgSqr         = f44    // x^2
	fArgSqrSgn      = f59    // sign(x)*x^2
	fArg3           = f49    // |x|^3
	fArg3Sgn        = f50    // sign(x)*|x|^3
	fArg4           = f47    // x^4
	fArg4Sgn        = f48    // sign(x)*x^4
	fArg6Sgn        = f52    // sign(x)*x^6
	fArg7Sgn        = f51    // sign(x)*|x|^7

	// ---- Floating point: partial polynomial sums ----
	fPolA           = f55
	fPolATmp        = f56
	fPolC           = f53
	fPolCTmp        = f54
	fPolD           = f57
	fPolDTmp        = f58

	// Data tables
	//==============================================================

	// tanhf_data: double-precision coefficient table read by tanhf.
	//
	// NOTE(review): an identical tanhf_data object is already defined earlier
	// in this file; the two definitions will most likely clash when
	// assembled, so one copy presumably needs to be removed.
	//
	// Layout (byte offsets from the start of the table):
	//     0..319  five 64-byte rows {C0,C1,C2,C3,D0,D1,D2,B0}, one per
	//             subrange of 0.3125 <= |x| < 8.0
	//   320..351  four coefficients for |x| < 0.3125
	//   352..543  six 32-byte rows {A0,A1,A2,A3}: five for the subranges of
	//             0.3125 <= |x| < 8.0, one for 8.0 <= |x| < 9.125
	//   544..551  saturation value returned for |x| >= 9.125
	//
	// tanhf addresses the table with hard-coded offsets (16, 48, 352, 544) and
	// row strides (64 and 32 bytes), so the order and count of the entries must
	// not change.  Coefficients are fetched in pairs with ldfpd at offsets that
	// are multiples of 16 from the table start.
	RODATA

	.align 16

	LOCAL_OBJECT_START(tanhf_data)
	// Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5
	// (offset 0)
	data8 0x3F9BEEDFDD177D7B // C0
	data8 0x3F970D10C7F32458 // C1
	data8 0x3F766D6B051F3A38 // C2
	data8 0xBF732F2001B23402 // C3
	data8 0xBF854BE1CE1ED499 // D0
	data8 0x4013C944F3999A16 // D1
	data8 0xC01106C6975222C0 // D2
	data8 0x3F783D5ACCF9EBE8 // B0
	// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
	// (offset 64)
	data8 0xBF5D631440786869 // C0
	data8 0xBF575D79A0D52069 // C1
	data8 0xBF7E2237B7EFC705 // C2
	data8 0x3F6A7ACBC273041F // C3
	data8 0xC040E32EA52D91EB // D0
	data8 0x403D19463E5DB4D7 // D1
	data8 0xC02216F61F759F39 // D2
	data8 0xBF55B4EA0B844BE7 // B0
	// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
	// (offset 128)
	data8 0x3F8637DBE5B3E690 // C0
	data8 0xBF7F7FEC158C07F5 // C1
	data8 0x3F711C586706838A // C2
	data8 0xBF50EF7EF605554E // C3
	data8 0xC054D45448354E25 // D0
	data8 0x404ADFEEA282E730 // D1
	data8 0xC028AEE456D59549 // D2
	data8 0x3F25232D1BED59A8 // B0
	// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0
	// (offset 192)
	data8 0xBF52602285F2D06C // C0
	data8 0x3F2E57C298FFE1E0 // C1
	data8 0xBF15ED575DB3C811 // C2
	data8 0x3EE428878A08525C // C3
	data8 0xC0895A26849039C1 // D0
	data8 0x406E3C60BBFBB575 // D1
	data8 0xC03A06F62867C75A // D2
	data8 0xBEB114C70F1C723E // B0
	// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0
	// (offset 256)
	data8 0x3EF4B22BD17039A3 // C0
	data8 0xBEB704ADC040C57F // C1
	data8 0x3E937A98288AFE1A // C2
	data8 0xBE4F33B2C9FFE7E7 // C3
	data8 0xC0BE48CFADE2431E // D0
	data8 0x4090E74249760FDD // D1
	data8 0xC04B6F537FCF2F1E // D2
	data8 0x3E0DCD879C91ADEA // B0
	// Polynomial coefficients for the tanh(x), -0.3125 < x < 0.3125
	// (offset 320; tanhf loads these four values into fC0..fC3 and uses them
	//  as C0..C3 of x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0))
	data8 0xBFD555551E8245B7 // A0
	data8 0x3FC110E63F52E689 // A1
	data8 0xBFAB8CD6A5B7BAFA // A2
	data8 0x3F945D467FCEB553 // A3
	// Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5
	// (offset 352)
	data8 0xBE3DCC92FCAECBB6 // A0
	data8 0x3FF0000043B7D267 // A1
	data8 0xBED18BF28ACFC4B1 // A2
	data8 0xBFD554A56F82837E // A3
	// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
	// (offset 384)
	data8 0x3EFD6054758539F9 // A0
	data8 0x3FEFFBFC77198EBE // A1
	data8 0x3F700327CA98D237 // A2
	data8 0xBFD68955F5BB2FA1 // A3
	// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
	// (offset 416)
	data8 0xBF71A53F229DF01B // A0
	data8 0x3FF0AECFD730DE50 // A1
	data8 0xBFC882F88E5DF3BA // A2
	data8 0x3FC6EDF212CA2A8D // A3
	// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0
	// (offset 448)
	data8 0xBFAF0B712E9EDA47 // A0
	data8 0x3FF1C208080BEA64 // A1
	data8 0x3FC3D29B20C8946E // A2
	data8 0xBFF04514ED900A6A // A3
	// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0
	// (offset 480)
	data8 0xBFB1DEA49A831CBC // A0
	data8 0x3FFA729FC7085674 // A1
	data8 0xBFF2F44D923A8FA4 // A2
	data8 0x3FE092FC5712227E // A3
	// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 9.125
	// (offset 512)
	data8 0x3FEFFF5769EE3041 // A0
	data8 0x3EFBBF148D850891 // A1
	data8 0xBEC86BCEF0F5C2FE // A2
	data8 0x3E7CBA4F3A885A5C // A3
	//
	// (offset 544) largest double below 1.0, i.e. 1.0 - 2^(-53)
	data8 0x3FEFFFFFFFFFFFFF // 1.0 - epsilon
	LOCAL_OBJECT_END(tanhf_data)

	//==============================================================
	// float tanhf(float x)
	//
	// NOTE(review): tanhf and its labels are already defined earlier in this
	// file with the same instruction sequence; the duplicate symbols will
	// most likely clash when assembled, so one copy presumably needs to be
	// removed.
	//
	// In:   f8 = x
	// Out:  f8 = tanhf(x).  Every exit path writes f8 with a
	//       single-precision operation (fma.s / fms.s / fnma.s), except the
	//       +/-inf exit, which builds sign(x)*1.0 with fmerge.s.
	// Uses: r2, the r32-r46 frame created by alloc, the f32-f59 aliases and
	//       predicates p6-p15.  The routine makes no calls.
	//
	// The path is selected from the raw single-precision bits of x:
	//   fclass.m denormal               -> tanhf_denormal
	//   fclass.m NaN or zero            -> return x*1 + x
	//   fclass.m infinity               -> return sign(x)*1.0
	//   rAbsArg <  0x3EA00000 (0.3125f) -> tanhf_near_zero
	//   rAbsArg >= 0x41120000 (9.125f)  -> tanhf_saturation
	//   rBias == 0x14 (8.0 <= |x|)      -> tanhf_close_to_saturation
	//   otherwise                       -> main path, 0.3125 <= |x| < 8.0
	//
	// For |x| >= 0.3125, rBias = 4*(biased exponent of x - 125), i.e.
	// 0,4,8,12,16 for the five main subranges and 20 for 8.0 <= |x| < 16.
	// rBias*16 selects a 64-byte C/D/B row of tanhf_data and rBias*8 (added
	// to offset 352) selects a 32-byte A row.
	//==============================================================
	.section .text
	GLOBAL_LIBM_ENTRY(tanhf)

	// Create the register frame, form |x| and x^2, and start building the
	// integer constants (completed by the shl instructions further down).
	{ .mfi
	alloc r32 = ar.pfs, 1, 14, 0, 0
	fmerge.s fAbsArg = f1, f8 // |x|: sign of 1.0, magnitude of x
	addl rMask = 0x806, r0
	}
	{ .mfi
	addl rDataPtr = @ltoff(tanhf_data), gp
	fma.s1 fArgSqr = f8, f8, f0 // x^2
	adds rSignBit = 0x1, r0
	}
	;;

	{ .mfi
	getf.s rArg = f8 // x in GR
	fclass.m p7,p0 = f8, 0x0b // is x denormal ?
	// sign bit and 2 most bits in significand
	// (0x806 << 20 = 0x80600000: bits 31, 22 and 21)
	shl rMask = rMask, 20
	}
	{ .mfi
	// replace the @ltoff slot address with the table address stored there
	ld8 rDataPtr = [rDataPtr]
	nop.f 0
	// 0x1F4 = 500 = 4*125: value of rOffset2 for 0.3125 <= |x| < 0.5
	adds rBias2 = 0x1F4, r0
	}
	;;

	{ .mfi
	// rBias value that identifies the 8.0 <= |x| subrange
	adds rNearSaturation = 0x14, r0
	fmerge.s fSignumX = f8, f1 // signum(x)
	shl rSignBit = rSignBit, 31 // mask for sign bit
	}
	{ .mfi
	adds rBound = 0x3EA, r0
	nop.f 0
	addl rSaturation = 0x4112, r0
	}
	;;

	{ .mfi
	// clear the sign bit and the 2 top significand bits of x
	andcm rOffset2 = rArg, rMask
	fclass.m p6,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ?
	shl rBound = rBound, 20 // 0.3125f in GR (0x3EA00000)
	}
	{ .mfb
	andcm rAbsArg = rArg, rSignBit // |x| in GR
	nop.f 0
	(p7) br.cond.spnt tanhf_denormal // branch out if x is denormal
	}
	;;

	{ .mfi
	// the {A0..A3} rows for |x| >= 0.3125 start at byte offset 352
	adds rCoeffAddr2 = 352, rDataPtr
	fclass.m p9,p0 = f8, 0x23 // is x +/- inf?
	// rOffset2 = 4 * (biased exponent of x)
	shr rOffset2 = rOffset2, 21
	}
	{ .mfi
	// p10 = |x| < 0.3125, p8 = |x| >= 0.3125 (integer compare of the bits)
	cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.3125?
	nop.f 0
	// second coefficient pair (C2,C3) of a row is 16 bytes into the row
	adds rCoeffAddr3 = 16, rDataPtr
	}
	;;

	// Choose the coefficient set.  p8 and p10 are complementary, so exactly one
	// of the two writes to rBias takes effect.
	{ .mfi
	(p8) sub rBias = rOffset2, rBias2
	fma.s1 fArg4 = fArgSqr, fArgSqr, f0 // x^4
	shl rSaturation = rSaturation, 16 // 9.125f in GR (0x41120000)
	}
	{ .mfb
	// |x| < 0.3125: 0x14*16 = 320, the offset of the near-zero coefficients
	(p10) adds rBias = 0x14, r0
	(p6) fma.s.s0 f8 = f8,f1,f8 // NaN or +/-0: result is x*1 + x
	(p6) br.ret.spnt b0 // exit for x = NaN or +/-0
	}
	;;

	{ .mfi
	// rCoeffAddr1 = rDataPtr + 16*rBias
	shladd rCoeffAddr1 = rBias, 4, rDataPtr
	fma.s1 fArg3Sgn = fArgSqr, f8, f0 // sign(x)*|x|^3
	// is |x| < 9.125?
	// p11 = |x| < 9.125 (also true for |x| < 0.3125), p12 = |x| >= 9.125
	cmp.lt p11, p12 = rAbsArg, rSaturation
	}
	{ .mfi
	// rCoeffAddr3 = rDataPtr + 16 + 16*rBias
	shladd rCoeffAddr3 = rBias, 4, rCoeffAddr3
	fma.s1 fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
	// rCoeffAddr2 = rDataPtr + 352 + 8*rBias
	shladd rCoeffAddr2 = rBias, 3, rCoeffAddr2
	}
	;;

	// Coefficient loads are all predicated on p11, so nothing is read from the
	// table through the rBias-derived addresses when |x| >= 9.125.
	{ .mfi
	(p11) ldfpd fC0, fC1 = [rCoeffAddr1]
	(p9) fmerge.s f8 = f8,f1 // +/- inf: sign(x) * 1.0
	// |x| >= 9.125: point at the saturation constant at offset 544
	(p12) adds rDataPtr = 544, rDataPtr
	}
	{ .mfb
	// post-increment leaves rCoeffAddr3 at the D0,D1 pair
	(p11) ldfpd fC2, fC3 = [rCoeffAddr3], 16
	nop.f 0
	(p9) br.ret.spnt b0 // exit for x = +/- inf
	}
	;;

	{ .mfi
	// post-increment leaves rCoeffAddr2 at the A2,A3 pair
	(p11) ldfpd fA0, fA1 = [rCoeffAddr2], 16
	nop.f 0
	// p13 = (|x| >= 0.3125) and rBias == 0x14; cleared when p8 is false.
	// It is only tested after the p12 branch, i.e. for |x| < 9.125.
	(p8) cmp.eq.unc p13, p0 = rBias, rNearSaturation
	}
	{ .mfi
	// D2,B0 pair is 48 bytes into the row
	add rCoeffAddr1 = 48, rCoeffAddr1
	nop.f 0
	nop.i 0
	}
	;;

	{ .mfi
	(p11) ldfpd fD0, fD1 = [rCoeffAddr3]
	nop.f 0
	nop.i 0
	}
	{ .mfb
	(p11) ldfpd fD2, fB0 = [rCoeffAddr1]
	// sign(x)*|x|^2
	fma.s1 fArgSqrSgn = fArgSqr, fSignumX, f0
	(p10) br.cond.spnt tanhf_near_zero
	}
	;;

	{ .mfi
	(p11) ldfpd fA2, fA3 = [rCoeffAddr2], 16
	// p15 = x < 0, p14 = not (x < 0)
	fcmp.lt.s1 p15, p14 = f8,f0
	nop.i 0
	}
	{ .mfb
	// |x| >= 9.125: fA0 = the constant at offset 544 (0x3FEFFFFFFFFFFFFF)
	(p12) ldfd fA0 = [rDataPtr]
	fma.s1 fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
	(p12) br.cond.spnt tanhf_saturation
	}
	;;
	{ .mfi
	nop.m 0
	fma.s1 fArg7Sgn = fArg4, fArg3Sgn, f0 // sign(x)*|x|^7
	nop.i 0
	}
	{ .mfb
	nop.m 0
	fma.s1 fArg6Sgn = fArg3, fArg3Sgn, f0 // sign(x)*|x|^6
	(p13) br.cond.spnt tanhf_close_to_saturation
	}
	;;

	// Main path, 0.3125 <= |x| < 8.0:
	//   result = PolC(|x|) * (sign(x)*PolD) + sign(x)*PolA(|x|)
	{ .mfi
	nop.m 0
	fma.s1 fPolC = fC3, fAbsArg, fC2 // C3*|x| + C2
	nop.i 0
	}
	{ .mfi
	nop.m 0
	fma.s1 fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
	nop.i 0
	};;

	{ .mfi
	nop.m 0
	fma.s1 fPolA = fA1, fAbsArg, fA0 // A1*|x| + A0
	nop.i 0
	}
	;;

	{ .mfi
	nop.m 0
	fma.s1 fPolD = fD1, fAbsArg, fD0 // D1*|x| + D0
	nop.i 0
	}
	{ .mfi
	nop.m 0
	// sign(x)*(|x|^7 + D2*x^6)
	fma.s1 fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
	nop.i 0
	};;

	{ .mfi
	nop.m 0
	fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
	nop.i 0
	}
	{ .mfi
	nop.m 0
	// the coefficient register is reused to hold the product
	fma.s1 fB0 = fB0, fArg4, f0 // B0*x^4
	nop.i 0
	};;

	{ .mfi
	nop.m 0
	// C3*|x|^3 + C2*x^2 + C1*|x| + C0
	fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp
	nop.i 0
	}
	;;

	{ .mfi
	nop.m 0
	// PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
	fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp
	nop.i 0
	}
	;;

	{ .mfi
	nop.m 0
	// PolA = A3*|x|^3 + A2*x^2 + A1*|x| + A0
	fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA
	nop.i 0
	}
	;;

	{ .mfi
	nop.m 0
	// PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
	fma.d.s1 fPolC = fPolC, f1, fB0
	nop.i 0
	}
	;;

	// fPolD already carries sign(x); fPolA does not, so it is added for
	// x >= 0 (p14) and subtracted for x < 0 (p15).
	{ .mfi
	nop.m 0
	(p14) fma.s.s0 f8 = fPolC, fPolD, fPolA // for positive x
	nop.i 0
	}
	{ .mfb
	nop.m 0
	(p15) fms.s.s0 f8 = fPolC, fPolD, fPolA // for negative x
	br.ret.sptk b0 // Exit for 0.3125 <= |x| < 8.0
	};;


	// Here if |x| < 0.3125
	// fC0..fC3 hold the four coefficients stored at table offset 320 (rBias
	// was forced to 0x14 on this path).
	tanhf_near_zero:
	{ .mfi
	nop.m 0
	fma.s1 fPolC = fC3, fArgSqr, fC2 // C3*x^2 + C2
	nop.i 0
	}
	{ .mfi
	nop.m 0
	fma.s1 fPolCTmp = fC1, fArgSqr, fC0 // C1*x^2 + C0
	nop.i 0
	};;

	{ .mfi
	nop.m 0
	fma.s1 fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0
	nop.i 0
	};;

	{ .mfb
	nop.m 0
	// x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0)
	fma.s.s0 f8 = fPolC, fArg3Sgn, f8
	br.ret.sptk b0 // Exit for |x| < 0.3125
	};;

	// Here if 9.125 <= |x| < +inf
	// fA0 holds the constant loaded from table offset 544.
	tanhf_saturation:
	{ .mfb
	nop.m 0
	fma.s.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0d - 2^(-53))
	// Exit for 9.125 <= |x| < +inf
	br.ret.sptk b0 // Exit for 9.125 <= |x| < +inf
	}
	;;

	// Here if 8.0 <= |x| < 9.125
	// fA0..fA3 hold the row at table offset 512 (352 + 8*0x14).
	// Result: sign(x)*(A3*|x|^3 + A2*x^2 + A1*|x| + A0)
	tanhf_close_to_saturation:
	{ .mfi
	nop.m 0
	fma.s1 fPolATmp = fA1, fAbsArg, fA0 // A1*|x| + A0
	nop.i 0
	}
	{ .mfi
	nop.m 0
	fma.s1 fPolA = fA3, fAbsArg, fA2 // A3*|x| + A2
	nop.i 0
	}
	;;

	// p14/p15 come from the fcmp.lt above; the annotation tells the assembler
	// that the two predicated writes to f8 below cannot both execute.
	.pred.rel "mutex", p14, p15
	{ .mfi
	nop.m 0
	// for positive x
	(p14) fma.s.s0 f8 = fPolA, fArgSqr, fPolATmp
	nop.i 0
	}
	{ .mfb
	nop.m 0
	// for negative x
	// (A3*|x| + A2)*(sign(x)*x^2) - (A1*|x| + A0)
	(p15) fms.s.s0 f8 = fPolA, fArgSqrSgn, fPolATmp
	br.ret.sptk b0 // Exit for 8.0 <= |x| < 9.125
	};;

	// Here if x is single precision denormal
	// p7 = x is a negative denormal, p8 = otherwise (a positive denormal here).
	tanhf_denormal:
	{ .mfi
	nop.m 0
	fclass.m p7,p8 = f8, 0x0a // is x -denormal ?
	nop.i 0
	}
	;;

	{ .mfi
	nop.m 0
	(p7) fma.s.s0 f8 = f8,f8,f8 // -denormal: x + x^2
	nop.i 0
	}
	{ .mfb
	nop.m 0
	(p8) fnma.s.s0 f8 = f8,f8,f8 // +denormal: x - x^2
	br.ret.sptk b0 // Exit for denormal
	}
	;;

	GLOBAL_LIBM_END(tanhf)