/*----------------------------------------------------------------------
blokpi.c: calculate pi v1.0
This code is for a very fast pi calculator that adds up series with
a certain special form. It uses block summation (see the blokpi.txt
for what this means and how it works) and can be configured at com-
pile time for optimal pi-crunching speed across different platforms.
I designed it to be compiled using djgpp on the Pentium processor
(since that's what I have), and it runs fastest for these, but the
code involved is generic enough so that any ANSI C compiler will do.
Another plus is that the series added up can be changed with a min-
imum of effort, so this program can *also* add up similar series
for other constants, also at breakneck speed.
This program is placed in the public domain by its author, Jason
Papadopoulos. Do whatever the heck you want with it, but if you
figure out a way to speed it up, please be kind and send an e-mail
to jasonp@glue.umd.edu
--------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* compile for Intel FPU */
#define INTEL
/* number of doubles in 1 block */
#define BLOCKPI 1000
/* number of fractions in 1 block.
If GCCDOS is defined this must
be a multiple of 4; otherwise it
must be a multiple of 3. To be
really general, make it a mul-
tiple of 12). NOTE: it must also
be a multiple of FRACTIONS_PER_TERM */
#define BLOCKTERM 252
/* special high-performance code is
available when compiled using djgpp
for DOS. Runs about 18% faster. */
#define
GCCDOS
/* specifics of the formula used */
#define FRACTIONS_PER_TERM 2
#define CORRECT_DIGITS_PER_TERM 1.4314
double stuff[14] = { 0, 0, /* don't mess with these; filled later */
13, /* 1st
additive constant; req'd */
7, /*
increment for 1st additive constant */
8, /*
numerator of scaling constant */
15, /*
denominator of scaling constant */
2, /*
1st
numerator
*/
2, /*
increment for 1st
numerator */
11, /* 1st
denominator
*/
6, /*
increment for 1st denominator */
1, /*
2nd
numerator
*/
2, /*
increment for 2nd
numerator */
-21, /* 2nd
denominator
*/
-18}; /* increment for
2nd denominator */
/* ...add more fractions if needed... */
/* Pick the base you want to work in, i.e.
how many digits get stored in a double.
For Intel registers, BASEDIGITS + (max
number of digits in num and denom of
fractions) = 18. For anything else the
sum is 14 (the IEEE guarantees at least
that much). The more terms you want to
add up the smaller BASE must be. 10 is
a good default for Intel stuff */
#define BASE 1.0e10
#define BASEDIGITS 10
/******************* CODE STARTS HERE ***********************************/
typedef struct {
double recip;
double num;
double den;
double rem;
} term;
#ifdef INTEL
#define ROUND 3.0 * 2147483648.0 * 2147483648.0 /* 3*2^62 */
#else
#define ROUND 3.0 * 67108864.0 *
67108864.0
/* 3*2^52 */
#endif
#ifdef GCCDOS
/*-----------------------ASSEMBLY CODED NUMBER CRUNCHING ROUTINE -----*/
#define PIPELINE 4
void crunch( double *pi, long length, term *t ) {
/* High-performance djgpp code that multiplies a block of the
array pi by a quartet of fractions t[0],t[1],t[2],t[3].
*pi must point to one element beyond the end of the block.
It works with four fractions at a time, whereas the generic
crunch() works with three.
Running time: the inner loop runs in 50 cycles on a Pentium.
There's a stall here and there on more modern processors be-
cause you have to wait longer for multiplies to finish, although
I've tried to minimize those too. "length" is the number of
doubles before *pi that the loop will start from, and must
be negative. */
asm("
fldl
24(%1) #
r0
loads cache lines for
fldl
56(%1) #
r1 r0 each term
fldl
88(%1) #
r2 r1 r0
fldl
120(%1) #
r3 r2 r1 r0
.align 4
0:
fldl
_stuff #
B r3 r2 r1
r0 B = BASE
fmul %%st, %%st(4) #
B r3 r2 r1 R0
fldl
(%0,%2,8) # x0
B r3 r2 r1 R0
fmull
8(%1) #
nx0 B r3 r2
r1 R0
fxch
%%st(1) #
B nx0 r3 r2
r1 R0
fldl -8(%0,%2,8)
# x1 B nx0 r3
r2 r1 R0
fmull
40(%1) #
nx1 B nx0 r3 r2
r1 R0
fxch
%%st(6) #
R0 B nx0 r3
r2 r1 nx1
faddp %%st, %%st(2) #
B d0 r3 r2
r1 nx1
fmul %%st, %%st(4) #
B d0 r3 r2
R1 nx1
fldl -16(%0,%2,8) #
x2 B d0 r3
r2 R1 nx1
fmull
72(%1) #
nx2 B d0 r3
r2 R1 nx1
fxch
%%st(6) #
nx1 B d0 r3
r2 R1 nx2
faddp %%st, %%st(5) #
B d0 r3 r2
d1 nx2
fmul %%st, %%st(3) #
B d0 r3 R2
d1 nx2
fldl -24(%0,%2,8) #
x3 B d0 r3
R2 d1 nx2
fmull
104(%1) # nx3
B d0 r3 R2
d1 nx2
fxch
%%st(6) #
nx2 B d0 r3
R2 d1 nx3
faddp %%st, %%st(4) #
B d0 r3 d2
d1 nx3
fmulp %%st, %%st(2) # d0 R3 d2 d1 nx3
fld
%%st
# d0 d0 R3 d2
d1 nx3
fmull
(%1)
# q0 d0 R3 d2
d1 nx3
fxch
%%st(5) #
nx3 d0 R3 d2 d1 q0
faddp %%st, %%st(2) # d0 d3 d2 d1 q0
fld
%%st(3) #
d1 d0 d3 d2
d1 q0
fmull
32(%1) #
q1 d0 d3 d2
d1 q0
fxch
%%st(5) #
q0 d0 d3 d2
d1 q1
faddl
_stuff+8 # q0+
d0 d3 d2 d1 q1
fld
%%st(3) #
d2 q0+ d0 d3 d2
d1 q1
fmull
64(%1) #
q2 q0+ d0 d3 d2
d1 q1
fxch
%%st(6) #
q1 q0+ d0 d3 d2
d1 q2
faddl
_stuff+8 # q1+
q0+ d0 d3 d2 d1 q2
fld
%%st(3) #
d3 q1+ q0+ d0 d3
d2 d1 q2
fmull
96(%1) #
q3 q1+ q0+ d0 d3
d2 d1 q2
fxch
%%st(7) #
q2 q1+ q0+ d0 d3
d2 d1 q3
faddl
_stuff+8 # q2+
q1+ q0+ d0 d3 d2
d1 q3
fxch
%%st(2) #
q0+ q1+ q2+ d0 d3
d2 d1 q3
fsubl
_stuff+8 # q0-
q1+ q2+ d0 d3 d2
d1 q3
fxch
%%st(1) #
q1+ q0- q2+ d0 d3
d2 d1 q3
fsubl
_stuff+8 # q1-
q0- q2+ d0 d3 d2
d1 q3
fxch
%%st(7) #
q3 q0- q2+ d0 d3
d2 d1 q1-
faddl
_stuff+8 # q3+
q0- q2+ d0 d3 d2
d1 q1-
fxch
%%st(2) #
q2+ q0- q3+ d0 d3
d2 d1 q1-
fsubl
_stuff+8 # q2-
q0- q3+ d0 d3 d2
d1 q1-
fxch
%%st(1) #
q0- q2- q3+ d0 d3
d2 d1 q1-
fstl
(%0,%2,8) # q0-
q2- q3+ d0 d3 d2
d1 q1-
fmull
16(%1) #
nq0 q2- q3+ d0 d3
d2 d1 q1-
fxch
%%st(7) #
q1- q2- q3+ d0 d3
d2 d1 nq0
fstl -8(%0,%2,8)
# q1- q2- q3+ d0 d3
d2 d1 nq0
fmull
48(%1) #
nq1 q2- q3+ d0 d3
d2 d1 nq0
fxch
%%st(2) #
q3+ q2- nq1 d0 d3
d2 d1 nq0
fsubl
_stuff+8 # q3-
q2- nq1 d0 d3 d2
d1 nq0
fxch
%%st(1) #
q2- q3- nq1 d0 d3
d2 d1 nq0
fstl -16(%0,%2,8) #
q2- q3- nq1 d0 d3
d2 d1 nq0
fmull
80(%1) #
nq2 q3- nq1 d0 d3
d2 d1 nq0
fxch
%%st(7) #
nq0 q3- nq1 d0 d3
d2 d1 nq2
fsubrp %%st, %%st(3) # q3-
nq1 R0 d3 d2 d1
nq2
fstl -24(%0,%2,8) #
q3- nq1 R0 d3 d2
d1 nq2
fmull
112(%1) # nq3
nq1 R0 d3 d2 d1
nq2
fxch
%%st(1) #
nq1 nq3 R0 d3 d2
d1 nq2
fsubrp %%st, %%st(5) # nq3 R0 d3 d2 R1 nq2
fxch
%%st(5) #
nq2 R0 d3 d2 R1
nq3
fsubrp %%st, %%st(3) # R0 d3 R2 R1 nq3
fxch
%%st(4) #
nq3 d3 R2 R1
R0 remainders end up
fsubrp %%st, %%st(1) # R3
R2 R1
R0 in original
order
incl %2
jnz 0b
fstpl 120(%1) # r2 r1 r0
fstpl
88(%1) #
r1 r0
fstpl 56(%1) # r0
fstpl 24(%1)
":
: "r"(pi), "r"(t), "r"(length)
: "memory" );
}
#else
/*----------------------------- GENERIC NUMBER CRUNCHING ROUTINE -----*/
#define PIPELINE 3
void crunch( double *pi, long length, term *t ) {
/* multiply a block of the array pi by a trio of fractions
t[0],t[1],t[2]. *pi must point to one element beyond the
end of the block. For RISC machines one may get better
performance by mixing loads and stores with arithmetic
more thoroughly than is done here. "length" is the number of
doubles before *pi that the loop will start from (usually
-BLOCKPI), and must be negative. */
long j;
register double f0, f1, f2, f3, f4, f5;
f0 = t[0].rem;
f1 = t[1].rem;
f2 = t[2].rem;
for (j=length; j; j++) {
f3 = pi[j]; f3 *= t[0].num;
f4 = pi[j-1]; f4 *= t[1].num;
f5 = pi[j-2]; f5 *= t[2].num;
f0 *= stuff[0]; f1 *= stuff[0]; f2 *= stuff[0];
f0 +=
f3; f1 +=
f4; f2 += f5;
f3 = f0; f3 *= t[0].recip;
f4 = f1; f4 *= t[1].recip;
f5 = f2; f5 *= t[2].recip;
f3 += stuff[1]; f4 += stuff[1]; f5 += stuff[1];
f3 -= stuff[1]; f4 -= stuff[1]; f5 -= stuff[1];
pi[j] = f3; f3 *= t[0].den;
pi[j-1] = f4; f4 *= t[1].den;
pi[j-2] = f5; f5 *= t[2].den;
f0 -= f3; f1 -= f4; f2 -= f5;
}
t[0].rem = f0;
t[1].rem = f1;
t[2].rem = f2;
}
/*---------------------------------------------------------------*/
#endif
void crunch_one( double *pi, long length, term *t ) {
/* multiply "length" doubles of the array *pi by the
single fraction that *t points to. *pi must be one
element beyond the end of the array. Note that this
code is very inefficient and should be used sparingly;
the pipelined crunch() routines are 2-2.5 times faster. */
long i;
register double f0, f1;
f0 = (*t).rem;
for (i=length; i; i++) {
f0 *= stuff[0];
f1 = pi[i]; f1 *= (*t).num;
f0 += f1; f1 = f0; f1 *= (*t).recip;
f1 += stuff[1];
f1 -= stuff[1];
pi[i] = f1;
f1 *= (*t).den; f0 -= f1;
}
(*t).rem = f0;
}
/*---------------------------------------------------------------*/
void blockmultiply( double *pi, long length,
term *t, long tlength ) {
/* Multiply a block of doubles "*pi" by a block of
fractions "*t". *pi and *t must point to one element
beyond the end of their respective blocks. "length"
and "tlength" denote the size of the *pi and *t arrays,
respectively, and must be negative. This is the program's
major workhorse routine, and multiplies by PIPELINE
fractions at a time. */
long i, j;
for (i=tlength; i; i+=PIPELINE) {
/* set up for pipelined multiplication. crunch_one
is used to move some remainders farther forward
than others, so that several array elements in pi[]
are all in different stages and can all be worked
on
simultaneuously.
*/
for (j=PIPELINE-1; j; j--)
crunch_one( &pi[length+j], -j, &t[i+PIPELINE-1-j] );
/* switch to the much faster multiply routine. If
the block size BLOCKPI is large then the overhead
of using crunch_one
becomes
negligible.
*/
crunch( pi, length+PIPELINE-1, &t[i] );
/* re-align all the remainders again, using crunch_one.
This lets you stop crunching out the present set
of fractions and pick up the computation later, in
the next block. Other computations on this block
can be performed, while the block is still in cache. */
for (j=1; j<PIPELINE; j++)
crunch_one( pi, -j, &t[i+j] );
}
}
/*---------------------------------------------------------------*/
void make_terms( double *pi, term *t, long number ) {
/* fill up *t with "number" fractions and use them
on pi[0]. *t points to the beginning of the term
array. */
long i, j;
register double f0, f1;
for(i=0; i<number; i+=FRACTIONS_PER_TERM) {
*pi += stuff[2];
stuff[2] -=
stuff[3];
/* add on next constant */
for(j=0; j<FRACTIONS_PER_TERM; j++) {
t[i+j].num = stuff [4*j+6];
t[i+j].den = stuff
[4*j+8];
/* create next fraction */
t[i+j].recip = 1.0 / t[i+j].den;
stuff [4*j+6] -= stuff [4*j+7];
stuff [4*j+8] -= stuff [4*j+9];
f0 = *pi;
f0 *= t[i+j].num;
f1 =
f0;
/* multiply pi[0] by it */
f1 *= t[i+j].recip;
f1 += stuff[1];
f1 -= stuff[1];
*pi = f1;
f1 *= t[i+j].den;
t[i+j].rem = f0 - f1;
}
}
}
/*-----------------------------------------------------------------------*/
void printout( double *x, long size ) { /* print out array of digits */
long i, j, line_number = 16-BASEDIGITS;
char digits[40], *decimalpoint;
/* this function is complicated because the size of
the "chunk" of digits printed is determined at
compile time, based on choice and the machine being
used. */
printf("%ld.\n", (long)x[0]);
for(i=1; i<size; i++) {
for(j=0; j<40; j++) digits[j] =
'0'; /* make array of zeros */
sprintf(&digits[14],"%f",
x[i]);
/* print x[i] into it */
decimalpoint = strchr( digits, '.'
); /* find the decimal pt */
*decimalpoint =
0;
/* make it end of str */
printf("%s ", decimalpoint - BASEDIGITS); /* print specified no. */
if( i%line_number == 0 )
printf("\n"); /* of digits backward
*/
}
}
/*------------------------------------------------- MAIN ---------------*/
int main(int argc, char *argv[]) {
double *x; term *y;
long digits, number, blocks, blockpiece,
i, j, k, termblocks, termpiece;
clock_t start, stop;
term scale;
if( argc!=2 ) {
printf("\nusage: blokpi NumberOfDigits > OutputFile");
printf("\n\n");
exit(-1);
}
digits=atol(argv[1]);
number=digits/CORRECT_DIGITS_PER_TERM + 3;
digits = digits/BASEDIGITS +
1;
/* figure out how many */
if( digits % BLOCKPI < 6
)
/* words are needed, and */
digits += 6 - digits %
BLOCKPI; /* make
sure there are */
/* enough for pipelining */
number *= FRACTIONS_PER_TERM;
while( (number % BLOCKTERM) % PIPELINE ) /* do the same for the */
number +=
FRACTIONS_PER_TERM;
/* number of terms. */
number /= FRACTIONS_PER_TERM;
x = (double *)calloc( digits, sizeof(double) ); /* allocate memory */
y = (term *)calloc( BLOCKTERM, sizeof(term) );
stuff[0] =
BASE;
/* initialize the array */
stuff[1] =
ROUND;
/* of auxiliary stuff */
stuff[2] += (number-1) * stuff[3];
for(j=0; j<FRACTIONS_PER_TERM; j++) {
stuff[4*j+6] += (number-1) * stuff[4*j+7];
stuff[4*j+8] += (number-1) * stuff[4*j+9];
}
/* chop up the problem */
blocks = digits /
BLOCKPI;
/* into blocks */
blockpiece = digits % BLOCKPI;
termblocks = (number * FRACTIONS_PER_TERM) / BLOCKTERM;
termpiece = (number * FRACTIONS_PER_TERM) % BLOCKTERM;
/* what follows is an awful mess, but basically the
program creates all the fractions a block at a
time, and multiplies the answer (a block at a time)
by all the fractions at once. It's complicated
because neither the data nor the fractions are an
integral number of blocks in size, and this has
to be accounted for. */
start = clock();
for(i=0; i<termblocks; i++)
{
make_terms( x, y, BLOCKTERM );
for( j=0,k=BLOCKPI; j<blocks; j++,k+=BLOCKPI ) {
blockmultiply( &x[k+1], -BLOCKPI,
&y[BLOCKTERM], -BLOCKTERM );
}
blockmultiply( &x[digits], -blockpiece+1,
&y[BLOCKTERM], -BLOCKTERM );
}
make_terms( x, y, termpiece );
for( j=0,k=BLOCKPI; j<blocks; j++,k+=BLOCKPI ) {
blockmultiply( &x[k+1], -BLOCKPI,
&y[termpiece], -termpiece );
}
blockmultiply( &x[digits], -blockpiece+1,
&y[termpiece], -termpiece );
x[0] +=
stuff[2];
/* we're mostly done now. By */
scale.num =
stuff[4];
/* this point all that's left */
scale.den =
stuff[5];
/* is to scale the answer */
scale.recip = 1.0 / stuff[5];
scale.rem = 0.0;
crunch_one( &x[digits], -digits, &scale );
stop = clock();
for(i=digits-1; i; i--)
{
/* release borrows (i.e. make */
while(x[i]<0)
{
/* all the digits positive) */
x[i]+=BASE;
x[i-1]--;
}
while(x[i]>=BASE) {
x[i]-=BASE;
x[i-1]++;
}
}
printout(x, digits);
printf( "\n\nelapsed time: %lf s\n",
(double)(stop-start)/(double)CLOCKS_PER_SEC);
free(x);
free(y);
/* whew! */
}