VirtuaNet Code

Status: Pre-Alpha
Brought to you by: ace123, hellcatv
[r69]: / grueworld / brain / qsqrt.h Maximize Restore History

178 lines (153 with data), 5.3 kB

/*
    Copyright (C) 2000 by Andrew Zabolotny (Intel version)
    Copyright (C) 2002 by Matthew Reda <reda@mac.com> (PowerPC version)
    Fast computation of sqrt(x) and 1/sqrt(x)
  
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.
  
    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.
  
    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

// Uncomment the following line to define CS_NO_QSQRT if you experience
// mysterious problems with CS which you think are related to this
// version of sqrt not behaving properly. If you find something like
// that I'd like to be notified of this so we can make sure this really
// is the problem.
//#define CS_NO_QSQRT

#ifndef __QSQRT_H__
#define __QSQRT_H__

#if (!defined (CS_NO_QSQRT)) && defined (PROC_X86) && defined (COMP_GCC)

/*
  NB: Single-precision floating-point format (32 bits):
	SEEEEEEE.EMMMMMMM.MMMMMMMM.MMMMMMMM
	S: Sign (0 - positive, 1 - negative)
	E: Exponent (plus 127, 8 bits)
	M: Mantissa (23 bits)
*/

/**
 * This is a relatively small inline routine which will compute sqrt(x)
 * very quick, taking a square root is now just a little longer than
 * doing a division. The function is inline so that it runs at maximal
 * possible speed. It does eight multiplications but multiplication is
 * cheap on P5+ processors (3 clocks). On a Celeron CPU it takes
 * approximatively 50 clocks, while a division is 42 clocks.
 */
static inline float qsqrt (float x)
{
  float ret;

// Original C++ formulae:
// float tmp = x;
// *((unsigned *)&tmp) = (0xbe6f0000 - *((unsigned *)&tmp)) >> 1;
// double h = x * 0.5;
// double a = tmp;
// a *= 1.5 - a * a * h;
// a *= 1.5 - a * a * h;
// return a * x;

  __asm__ (
		"flds	%1\n"			// x
		"movl	$0xbe6f0000,%%eax\n"
		"subl	%1,%%eax\n"
		"shrl	$1,%%eax\n"
		"movl	%%eax,%1\n"
		"flds	%2\n"			// x 0.5
		"fmul	%%st(1)\n"		// x h
		"flds	%3\n"			// x h 1.5
		"flds	%1\n"			// x h 1.5 a
		"fld	%%st\n"			// x h 1.5 a a
		"fmul	%%st\n"			// x h 1.5 a a*a
		"fmul	%%st(3)\n"		// x h 1.5 a a*a*h
		"fsubr	%%st(2)\n"		// x h 1.5 a 1.5-a*a*h
		"fmulp	%%st(1)\n"		// x h 1.5 a
		"fld	%%st\n"			// x h 1.5 a a
		"fmul	%%st\n"			// x h 1.5 a a*a
		"fmulp	%%st(3)\n"		// x a*a*h 1.5 a
		"fxch\n"			// x a*a*h a 1.5
		"fsubp  %%st,%%st(2)\n"		// x 1.5-a*a*h a
		"fmulp	%%st(1)\n"		// x a
		"fmulp	%%st(1)\n"		// a
	: "=&t" (ret), "+m" (x) : "m" (0.5F), "m" (1.5F)
	: "eax", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
  );
  return ret;
}

/**
 * This routine is basically equivalent to qsqrt() except that it returns
 * 1/sqrt(x) rather than the proper square root. It should be used anywhere
 * you need the inverse root (in 3D graphics it is a common situation),
 * because the routine is a little faster than qsqrt() and also you avoid
 * a division (which gives you an overall 2X speedup).
 */
static inline float qisqrt (float x)
{
  float ret;
  __asm__ (
		"flds	%1\n"			// x
		"movl	$0xbe6f0000,%%eax\n"
		"subl	%1,%%eax\n"
		"shrl	$1,%%eax\n"
		"movl	%%eax,%1\n"
		"flds	%2\n"			// x 0.5
		"fmulp	%%st(1)\n"		// h
		"flds	%3\n"			// h 1.5
		"flds	%1\n"			// h 1.5 a
		"fld	%%st\n"			// h 1.5 a a
		"fmul	%%st\n"			// h 1.5 a a*a
		"fmul	%%st(3)\n"		// h 1.5 a a*a*h
		"fsubr	%%st(2)\n"		// h 1.5 a 1.5-a*a*h
		"fmulp	%%st(1)\n"		// h 1.5 a
		"fld	%%st\n"			// h 1.5 a a
		"fmul	%%st\n"			// h 1.5 a a*a
		"fmulp	%%st(3)\n"		// a*a*h 1.5 a
		"fxch\n"			// a*a*h a 1.5
		"fsubp  %%st,%%st(2)\n"		// 1.5-a*a*h a
		"fmulp	%%st(1)\n"		// a
	: "=t" (ret), "+m" (x) : "m" (0.5F), "m" (1.5F)
	: "eax", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
  );
  return ret;
}

#elif (!defined (CS_NO_QSQRT)) && defined (PROC_POWERPC) && defined (COMP_GCC)

/**
 * Use the PowerPC fsqrte to get an estimate of 1/sqrt(x) Then apply two
 * Newton-Rhaphson refinement steps to get a more accurate response Finally
 * multiply by x to get x/sqrt(x) = sqrt(x).  Add additional refinement steps
 * to get a more accurate result.  Zero is treated as a special case, otherwise
 * we end up returning NaN (Not a Number).
 */
static inline float qsqrt(float x)
{
  float y0 = 0.0;

  if (x != 0.0)
  {
    float x0 = x * 0.5f;

    asm ("frsqrte %0,%1" : "=f" (y0) : "f" (x));
    
    y0 = y0 * (1.5f - x0 * y0 * y0);
    y0 = (y0 * (1.5f - x0 * y0 * y0)) * x;
  };
    
  return y0;
};

/**
 * Similar to qsqrt() above, except we do not multiply by x at the end, and
 * return 1/sqrt(x).
 */
static inline float qisqrt(float x)
{
  float x0 = x * 0.5f;
  float y0;
  asm ("frsqrte %0,%1" : "=f" (y0) : "f" (x));
    
  y0 = y0 * (1.5f - x0 * y0 * y0);
  y0 = y0 * (1.5f - x0 * y0 * y0);

  return y0;
};

#else

#include <math.h>
#define qsqrt(x)  sqrt(x)
#define qisqrt(x) (1.0/sqrt(x))

#endif

#endif // __QSQRT_H__