/* Single-precision floating point square root Optimized for G3 Powerpc32 processors written By Conn Clark. Copyright (C) 2004, 2006 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301 USA. Notes: This is sqrtf function utilizes the frsqrte instruction which is present on the 603, 603e, 604, 604e, G3, G4 and G5 processors. It generates ieee compliant results as long as the frsqrte returns an estimate that is within 1/59th the correct number. As far as I know the G5 is the only processor that has less accuracy than this. The 603, 603e, 604, and 604e may have less accuracy but I don't have a system to test on. This function won't work on a 601 or other processor that doesn't support the frsqrte instruction. If the result of the frsqrte instruction is less than 1/59th the correct value but with in 1/32 this function will return a value that is at most 1/4.22e+06 off. Algorithm Notes: This takes the reciprocal square root estimate and feeds it into three Newton Goldschmidt iterations. The result is a reciprocal square root. This value is then multiplied by the input value to get the square root. Care has been taken to avoid loading constants from the data segment of Glibc. This was done to avoid a data cache load. Optimization emphasis has been on dealing with valid inputs ( 0 <= x < +infinity). Dealing with invalid inputs could be done faster or better. */ .section ".text" .align 2 .globl __slow_ieee754_sqrtf .type __slow_ieee754_sqrtf, @function __slow_ieee754_sqrtf: b __ieee754_sqrtf .size __slow_ieee754_sqrtf, .-__slow_ieee754_sqrtf .section ".text" .align 2 .globl __ieee754_sqrtf .type __ieee754_sqrtf, @function __ieee754_sqrtf: /* start loading some constants for integer comparison */ lis 3,0x3f00 /* 0.5F equiv as an integer */ lis 4,0x3FC0 /* 1.5F equiv as an integer */ stfsu 1,-12(1) /* store original and get 12 bytes of stack space. */ stw 3,4(1) /* store 0.5F on the stack to be loaded by fpu */ mffs 6 /* store fpu configuration */ stw 4,8(1) /* store 1.5F on the stack to be loaded by fpu */ lis 7, 0x7F80 /* load NAN for testing */ lfs 2,4(1) /* load 0.5F into fpu reg 2 */ /*mtfsfi 7,0 */ /* relax fpu (not needed) */ lwz 5,0(1) /* load original value as an int for testing */ fmr 9,1 /* copy original value into fpu reg 9 */ cmpi 0,5,0x0000 /* test for positive zero */ rlwinm 12,5,0,0,1 /* mask off sign bit and store in reg 12*/ lfs 7,8(1) /* load 1.5F into fpu reg 7 */ cmpl 1,12,7 /* test for NAN results in cr1 */ ble neg_number_or_zero /* branch if less than or equal to zero */ frsqrte 1,1 /* get recip sqrt estimate (does no harm if input is NaN */ beq cr1, not_a_number /* branch if original value was not a number */ fmul 2,2,9 /* begin Goldschmidt */ fmuls 3,1,1 /* saves one clock without effecting accuracy */ fmul 4,2,3 fnmsubs 3,2,3,7 /* saves one clock without effecting accuracy */ fmul 5,3,3 fmul 1,3,1 fnmsubs 3,4,5,7 /* saves one clock without effecting accuracy */ fmul 4,4,5 fmul 1,3,1 fmul 5,3,3 fnmsub 3,4,5,7 fmul 1,3,1 fmul 1,9,1 /* get sqrt from recip sqrt */ addi 1,1,12 /* clean up stack */ mtfsf 0xff,6 /* restore fpu state */ frsp 1,1 /* round result to a float */ blr /* return */ neg_number_or_zero: lis 9,0x3F80 /* load up equiv of 1.0F */ stw 9,8(1) /* store 1.0F to where fpu can load it */ lis 6, 0x8000 /* negative */ lfs 2,8(1) /* load 1.0F into fpu reg 2 */ beq its_zero /* branch if zero */ cmpl 0,5,6 /* test for negative zero */ be its_zero /* branch if zero */ stfs 6,0(1) /* store fpu status */ lis 3,0x7FC0 /* load aNaN */ lis 4,0x2000 /* load FE_INVALID flag */ lwz 5,0(1) /* load fpu status in gp register */ ori 4,4,0x0200 /* load INV_SQRT flag */ or 5,5,4 /* or FE_INVALID and INV_SQRT flags with fpu status */ stw 3,4(1) /* store aNaN on stack */ stw 5,0(1) /* store fpu status */ lfs 6,0(1) /* load new fpu status */ lfs 1,4(1) /* load aNaN to be returned */ mtfsf 0xff,6 /* update fpu status to new value */ its_zero: fmuls 1,1,2 /* multiply by 1.0 to set appropriate status bits */ addi 1,1,12 /* clean up stack */ blr /* return */ not_a_number: lis 6,0x3F80 /* load up equiv of 1.0F */ stw 6,8(1) /* store 1.0F to where fpu can load it */ lfs 2,8(1) /* load 1.0F into fpu reg 2 */ mtfsf 0xff,6 /* restore fpu status */ fmuls 1,1,2 /* multiply by 1.0 to set appropriate status bits */ addi 1,1,12 /* clean up stack */ blr /* return */ .size __ieee754_sqrtf, .-__ieee754_sqrtf