/*----------------------------------------------------------------------
blokpi.c: calculate pi v1.0

This code is for a very fast pi calculator that adds up series with
a certain special form. It uses block summation (see the blokpi.txt
for what this means and how it works) and can be configured at com-
pile time for optimal pi-crunching speed across different platforms.
I designed it to be compiled using djgpp on the Pentium processor
(since that's what I have), and it runs fastest for these, but the
code involved is generic enough so that any ANSI C compiler will do.
Another plus is that the series added up can be changed with a min-
imum of effort, so this program can *also* add up similar series
for other constants, also at breakneck speed.

This program is placed in the public domain by its author, Jason
Papadopoulos. Do whatever the heck you want with it, but if you
figure out a way to speed it up, please be kind and send an e-mail
to jasonp@glue.umd.edu
--------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* compile for Intel FPU  */
#define INTEL          

/* number of doubles in 1 block */
#define BLOCKPI 1000

/* number of fractions in 1 block.
   If GCCDOS is defined this must
   be a multiple of 4; otherwise it
   must be a multiple of 3. To be
   really general, make it a mul-
   tiple of 12). NOTE: it must also
   be a multiple of FRACTIONS_PER_TERM  */
#define BLOCKTERM 252

/* special high-performance code is
   available when compiled using djgpp
   for DOS. Runs about 18% faster.     */
#define GCCDOS                       

                                    /* specifics of the formula used */
#define FRACTIONS_PER_TERM 2
#define CORRECT_DIGITS_PER_TERM 1.4314

double stuff[14] = { 0, 0,       /* don't mess with these; filled later */
                   13,           /* 1st additive constant; req'd        */
                   7,            /* increment for 1st additive constant */
                   8,            /* numerator of scaling constant       */
                   15,           /* denominator of scaling constant     */
                   2,            /* 1st numerator                       */
                   2,            /* increment for 1st numerator         */
                   11,           /* 1st denominator                     */
                   6,            /* increment for 1st denominator       */
                   1,            /* 2nd numerator                       */
                   2,            /* increment for 2nd numerator         */
                   -21,          /* 2nd denominator                     */
                   -18};         /* increment for 2nd denominator       */

                                 /*  ...add more fractions if needed... */

/* Pick the base you want to work in, i.e.
   how many digits get stored in a double.
   For Intel registers, BASEDIGITS + (max
   number of digits in num and denom of
   fractions) = 18. For anything else the
   sum is 14 (the IEEE guarantees at least
   that much). The more terms you want to
   add up the smaller BASE must be. 10 is
   a good default for Intel stuff */
#define BASE 1.0e10
#define BASEDIGITS 10


/******************* CODE STARTS HERE ***********************************/

typedef struct {
   double recip;
   double num;
   double den;
   double rem;
} term;

#ifdef INTEL
  #define ROUND 3.0 * 2147483648.0 * 2147483648.0        /* 3*2^62 */
#else
  #define ROUND 3.0 * 67108864.0 * 67108864.0            /* 3*2^52 */
#endif

#ifdef GCCDOS
/*-----------------------ASSEMBLY CODED NUMBER CRUNCHING ROUTINE -----*/
#define PIPELINE 4
void crunch( double *pi, long length, term *t ) {

   /* High-performance djgpp code that multiplies a block of the
      array pi by a quartet of fractions t[0],t[1],t[2],t[3].
      *pi must point to one element beyond the end of the block.
      It works with four fractions at a time, whereas the generic
      crunch() works with three.

      Running time: the inner loop runs in 50 cycles on a Pentium.
      There's a stall here and there on more modern processors be-
      cause you have to wait longer for multiplies to finish, although
      I've tried to minimize those too. "length" is the number of
      doubles before *pi that the loop will start from, and must
      be negative. */
    
   asm("
     fldl 24(%1)           # r0             loads cache lines for
     fldl 56(%1)           # r1   r0        each term
     fldl 88(%1)           # r2   r1   r0
     fldl 120(%1)          # r3   r2   r1   r0

    .align 4
    0:

     fldl _stuff           # B    r3   r2   r1   r0        B = BASE
     fmul %%st, %%st(4)    # B    r3   r2   r1   R0
     fldl (%0,%2,8)        # x0   B    r3   r2   r1   R0
     fmull 8(%1)           # nx0  B    r3   r2   r1   R0
     fxch %%st(1)          # B    nx0  r3   r2   r1   R0
     fldl -8(%0,%2,8)      # x1   B    nx0  r3   r2   r1   R0
     fmull 40(%1)          # nx1  B    nx0  r3   r2   r1   R0
     fxch %%st(6)          # R0   B    nx0  r3   r2   r1   nx1
     faddp %%st, %%st(2)   # B    d0   r3   r2   r1   nx1
     fmul %%st, %%st(4)    # B    d0   r3   r2   R1   nx1
     fldl -16(%0,%2,8)     # x2   B    d0   r3   r2   R1   nx1
     fmull 72(%1)          # nx2  B    d0   r3   r2   R1   nx1
     fxch %%st(6)          # nx1  B    d0   r3   r2   R1   nx2
     faddp %%st, %%st(5)   # B    d0   r3   r2   d1   nx2
     fmul %%st, %%st(3)    # B    d0   r3   R2   d1   nx2
     fldl -24(%0,%2,8)     # x3   B    d0   r3   R2   d1   nx2
     fmull 104(%1)         # nx3  B    d0   r3   R2   d1   nx2
     fxch %%st(6)          # nx2  B    d0   r3   R2   d1   nx3
     faddp %%st, %%st(4)   # B    d0   r3   d2   d1   nx3
     fmulp %%st, %%st(2)   # d0   R3   d2   d1   nx3
     fld %%st              # d0   d0   R3   d2   d1   nx3
     fmull (%1)            # q0   d0   R3   d2   d1   nx3
     fxch %%st(5)          # nx3  d0   R3   d2   d1   q0
     faddp %%st, %%st(2)   # d0   d3   d2   d1   q0
     fld %%st(3)           # d1   d0   d3   d2   d1   q0
     fmull 32(%1)          # q1   d0   d3   d2   d1   q0
     fxch %%st(5)          # q0   d0   d3   d2   d1   q1
     faddl _stuff+8        # q0+  d0   d3   d2   d1   q1
     fld %%st(3)           # d2   q0+  d0   d3   d2   d1   q1
     fmull 64(%1)          # q2   q0+  d0   d3   d2   d1   q1
     fxch %%st(6)          # q1   q0+  d0   d3   d2   d1   q2
     faddl _stuff+8        # q1+  q0+  d0   d3   d2   d1   q2
     fld %%st(3)           # d3   q1+  q0+  d0   d3   d2   d1   q2
     fmull 96(%1)          # q3   q1+  q0+  d0   d3   d2   d1   q2
     fxch %%st(7)          # q2   q1+  q0+  d0   d3   d2   d1   q3
     faddl _stuff+8        # q2+  q1+  q0+  d0   d3   d2   d1   q3
     fxch %%st(2)          # q0+  q1+  q2+  d0   d3   d2   d1   q3
     fsubl _stuff+8        # q0-  q1+  q2+  d0   d3   d2   d1   q3
     fxch %%st(1)          # q1+  q0-  q2+  d0   d3   d2   d1   q3
     fsubl _stuff+8        # q1-  q0-  q2+  d0   d3   d2   d1   q3
     fxch %%st(7)          # q3   q0-  q2+  d0   d3   d2   d1   q1-
     faddl _stuff+8        # q3+  q0-  q2+  d0   d3   d2   d1   q1-
     fxch %%st(2)          # q2+  q0-  q3+  d0   d3   d2   d1   q1-
     fsubl _stuff+8        # q2-  q0-  q3+  d0   d3   d2   d1   q1-
     fxch %%st(1)          # q0-  q2-  q3+  d0   d3   d2   d1   q1-
     fstl (%0,%2,8)        # q0-  q2-  q3+  d0   d3   d2   d1   q1-
     fmull 16(%1)          # nq0  q2-  q3+  d0   d3   d2   d1   q1-
     fxch %%st(7)          # q1-  q2-  q3+  d0   d3   d2   d1   nq0
     fstl -8(%0,%2,8)      # q1-  q2-  q3+  d0   d3   d2   d1   nq0
     fmull 48(%1)          # nq1  q2-  q3+  d0   d3   d2   d1   nq0
     fxch %%st(2)          # q3+  q2-  nq1  d0   d3   d2   d1   nq0
     fsubl _stuff+8        # q3-  q2-  nq1  d0   d3   d2   d1   nq0
     fxch %%st(1)          # q2-  q3-  nq1  d0   d3   d2   d1   nq0
     fstl -16(%0,%2,8)     # q2-  q3-  nq1  d0   d3   d2   d1   nq0
     fmull 80(%1)          # nq2  q3-  nq1  d0   d3   d2   d1   nq0
     fxch %%st(7)          # nq0  q3-  nq1  d0   d3   d2   d1   nq2
     fsubrp %%st, %%st(3)  # q3-  nq1  R0   d3   d2   d1   nq2
     fstl -24(%0,%2,8)     # q3-  nq1  R0   d3   d2   d1   nq2
     fmull 112(%1)         # nq3  nq1  R0   d3   d2   d1   nq2
     fxch %%st(1)          # nq1  nq3  R0   d3   d2   d1   nq2
     fsubrp %%st, %%st(5)  # nq3  R0   d3   d2   R1   nq2
     fxch %%st(5)          # nq2  R0   d3   d2   R1   nq3
     fsubrp %%st, %%st(3)  # R0   d3   R2   R1   nq3
     fxch %%st(4)          # nq3  d3   R2   R1   R0     remainders end up
     fsubrp %%st, %%st(1)  # R3   R2   R1   R0          in original order

     incl %2
     jnz 0b

     fstpl 120(%1)         # r2   r1   r0
     fstpl 88(%1)          # r1   r0   
     fstpl 56(%1)          # r0
     fstpl 24(%1)
   ":
    : "r"(pi), "r"(t), "r"(length)
    : "memory" );
}

#else
/*----------------------------- GENERIC NUMBER CRUNCHING ROUTINE -----*/

#define PIPELINE 3
void crunch( double *pi, long length, term *t ) {

   /* multiply a block of the array pi by a trio of fractions
      t[0],t[1],t[2]. *pi must point to one element beyond the
      end of the block. For RISC machines one may get better
      performance by mixing loads and stores with arithmetic
      more thoroughly than is done here. "length" is the number of
      doubles before *pi that the loop will start from (usually
      -BLOCKPI), and must be negative. */
    
   long j;
   register double f0, f1, f2, f3, f4, f5;

   f0 = t[0].rem;
   f1 = t[1].rem;
   f2 = t[2].rem;

   for (j=length; j; j++) {

      f3 = pi[j];     f3 *= t[0].num;
      f4 = pi[j-1];   f4 *= t[1].num;
      f5 = pi[j-2];   f5 *= t[2].num;
        
      f0 *= stuff[0];  f1 *= stuff[0];  f2 *= stuff[0];
      f0 += f3;        f1 += f4;        f2 += f5;

      f3 = f0;    f3 *= t[0].recip;
      f4 = f1;    f4 *= t[1].recip;
      f5 = f2;    f5 *= t[2].recip;

      f3 += stuff[1];  f4 += stuff[1];  f5 += stuff[1];
      f3 -= stuff[1];  f4 -= stuff[1];  f5 -= stuff[1];

      pi[j] = f3;     f3 *= t[0].den;
      pi[j-1] = f4;   f4 *= t[1].den;
      pi[j-2] = f5;   f5 *= t[2].den;

      f0 -= f3;   f1 -= f4;   f2 -= f5;
   }

   t[0].rem = f0;
   t[1].rem = f1;
   t[2].rem = f2;

}

/*---------------------------------------------------------------*/
#endif

void crunch_one( double *pi, long length, term *t ) {

   /* multiply "length" doubles of the array *pi by the
      single fraction that *t points to. *pi must be one
      element beyond the end of the array. Note that this
      code is very inefficient and should be used sparingly;
      the pipelined crunch() routines are 2-2.5 times faster. */

   long i;
   register double f0, f1;

   f0 = (*t).rem;

   for (i=length; i; i++) {
      f0 *= stuff[0];
      f1 = pi[i];            f1 *= (*t).num;
      f0 += f1;   f1 = f0;   f1 *= (*t).recip;
      f1 += stuff[1];
      f1 -= stuff[1];
      pi[i] = f1;
      f1 *= (*t).den;   f0 -= f1;
   }

   (*t).rem = f0;
}

/*---------------------------------------------------------------*/
void blockmultiply( double *pi, long length,
                    term *t, long tlength ) {

   /* Multiply a block of doubles "*pi" by a block of
      fractions "*t". *pi and *t must point to one element
      beyond the end of their respective blocks. "length"
      and "tlength" denote the size of the *pi and *t arrays,
      respectively, and must be negative. This is the program's
      major workhorse routine, and multiplies by PIPELINE
      fractions at a time. */

   long i, j;

   for (i=tlength; i; i+=PIPELINE) {
    
      /* set up for pipelined multiplication. crunch_one
         is used to move some remainders farther forward
         than others, so that several array elements in pi[]
         are all in different stages and can all be worked
         on simultaneuously.                               */

      for (j=PIPELINE-1; j; j--)
         crunch_one( &pi[length+j], -j, &t[i+PIPELINE-1-j] );

      /* switch to the much faster multiply routine. If
         the block size BLOCKPI is large then the overhead
         of using crunch_one becomes negligible.            */

      crunch( pi, length+PIPELINE-1, &t[i] );

      /* re-align all the remainders again, using crunch_one.
         This lets you stop crunching out the present set
         of fractions and pick up the computation later, in
         the next block. Other computations on this block
         can be performed, while the block is still in cache. */

      for (j=1; j<PIPELINE; j++)
         crunch_one( pi, -j, &t[i+j] );
   }
}

/*---------------------------------------------------------------*/
void make_terms( double *pi, term *t, long number ) {

   /* fill up *t with "number" fractions and use them
      on pi[0]. *t points to the beginning of the term
      array.   */

   long i, j;
   register double f0, f1;

   for(i=0; i<number; i+=FRACTIONS_PER_TERM) {
      *pi += stuff[2];
      stuff[2] -= stuff[3];                     /* add on next constant */

      for(j=0; j<FRACTIONS_PER_TERM; j++) {
         t[i+j].num = stuff [4*j+6];
         t[i+j].den = stuff [4*j+8];            /* create next fraction */
         t[i+j].recip = 1.0 / t[i+j].den;
         stuff [4*j+6] -= stuff [4*j+7];
         stuff [4*j+8] -= stuff [4*j+9];

         f0 = *pi;
         f0 *= t[i+j].num;
         f1 = f0;                               /* multiply pi[0] by it */
         f1 *= t[i+j].recip;
         f1 += stuff[1];
         f1 -= stuff[1];
         *pi = f1;
         f1 *= t[i+j].den;
         t[i+j].rem = f0 - f1;
      }
   }
}

/*-----------------------------------------------------------------------*/
void printout( double *x, long size ) {     /* print out array of digits */
   long i, j, line_number = 16-BASEDIGITS;
   char digits[40], *decimalpoint;

   /* this function is complicated because the size of
      the "chunk" of digits printed is determined at
      compile time, based on choice and the machine being
      used. */
    
   printf("%ld.\n", (long)x[0]);

   for(i=1; i<size; i++) {

      for(j=0; j<40; j++) digits[j] = '0';        /* make array of zeros */

      sprintf(&digits[14],"%f", x[i]);            /* print x[i] into it  */
      decimalpoint = strchr( digits, '.' );       /* find the decimal pt */
      *decimalpoint = 0;                          /* make it end of str  */

      printf("%s ", decimalpoint - BASEDIGITS);   /* print specified no. */
      if( i%line_number == 0 ) printf("\n");      /* of digits backward  */
   }
}

/*-------------------------------------------------  MAIN ---------------*/
int main(int argc, char *argv[]) {
   double *x; term *y;
   long digits, number, blocks, blockpiece,
        i, j, k, termblocks, termpiece;
   clock_t start, stop;
   term scale;

   if( argc!=2 ) {
      printf("\nusage: blokpi  NumberOfDigits > OutputFile");
      printf("\n\n");
      exit(-1);
   }

   digits=atol(argv[1]);
   number=digits/CORRECT_DIGITS_PER_TERM + 3;

   digits = digits/BASEDIGITS + 1;             /* figure out how many   */
   if( digits % BLOCKPI < 6 )                  /* words are needed, and */
      digits += 6 - digits % BLOCKPI;          /* make sure there are   */
                                               /* enough for pipelining */
   number *= FRACTIONS_PER_TERM;
   while( (number % BLOCKTERM) % PIPELINE )    /* do the same for the   */
      number += FRACTIONS_PER_TERM;            /* number of terms.      */
   number /= FRACTIONS_PER_TERM;

   x = (double *)calloc( digits, sizeof(double) );   /* allocate memory */
   y = (term *)calloc( BLOCKTERM, sizeof(term) );

   stuff[0] = BASE;                             /* initialize the array */
   stuff[1] = ROUND;                            /* of auxiliary stuff   */
   stuff[2] += (number-1) * stuff[3];
   for(j=0; j<FRACTIONS_PER_TERM; j++) {
      stuff[4*j+6] += (number-1) * stuff[4*j+7];
      stuff[4*j+8] += (number-1) * stuff[4*j+9];
   }
                                                /* chop up the problem  */
   blocks = digits / BLOCKPI;                   /* into blocks          */
   blockpiece = digits % BLOCKPI;
   termblocks = (number * FRACTIONS_PER_TERM) / BLOCKTERM;
   termpiece = (number * FRACTIONS_PER_TERM) % BLOCKTERM;

   /* what follows is an awful mess, but basically the
      program creates all the fractions a block at a
      time, and multiplies the answer (a block at a time)
      by all the fractions at once. It's complicated
      because neither the data nor the fractions are an
      integral number of blocks in size, and this has
      to be accounted for. */
 
   start = clock();
   for(i=0; i<termblocks; i++) {                     
      make_terms( x, y, BLOCKTERM );
      for( j=0,k=BLOCKPI; j<blocks; j++,k+=BLOCKPI ) {
         blockmultiply( &x[k+1], -BLOCKPI,
                        &y[BLOCKTERM], -BLOCKTERM );
      }
      blockmultiply( &x[digits], -blockpiece+1,
                     &y[BLOCKTERM], -BLOCKTERM );
   }

   make_terms( x, y, termpiece );
   for( j=0,k=BLOCKPI; j<blocks; j++,k+=BLOCKPI ) {
      blockmultiply( &x[k+1], -BLOCKPI,
                     &y[termpiece], -termpiece );
   }
   blockmultiply( &x[digits], -blockpiece+1,
                  &y[termpiece], -termpiece );


   x[0] += stuff[2];                       /* we're mostly done now. By  */
   scale.num = stuff[4];                   /* this point all that's left */
   scale.den = stuff[5];                   /* is to scale the answer     */
   scale.recip = 1.0 / stuff[5];
   scale.rem = 0.0;
   crunch_one( &x[digits], -digits, &scale );
   stop = clock();

   for(i=digits-1; i; i--) {               /* release borrows (i.e. make */
      while(x[i]<0) {                      /* all the digits positive)   */
         x[i]+=BASE;
         x[i-1]--;
      }
      while(x[i]>=BASE) {
         x[i]-=BASE;
         x[i-1]++;
      }
   }

   printout(x, digits);
   printf( "\n\nelapsed time: %lf s\n",
      (double)(stop-start)/(double)CLOCKS_PER_SEC);

   free(x); free(y);                                            /* whew! */
}