#include "camm_util.h"
#include "camm_arith.h"

/*
 * Prefetch hook.  The replacement list consists only of a comment, so every
 * pf(...) use in this file currently expands to nothing.
 */
#define pf(a_,b_)  /*  f(nta,a_,b_) */

/*
 * K1_10(a_,b_): one step of the kernel that feeds four accumulators.
 *
 * Sequence, as written:
 *   - load from offset a_ off ax into register 1 and from offset a_ off bx
 *     into register 0;
 *   - load three more operands off ax, at a_ extended by one, two and three
 *     KB4 terms (built with nested S());
 *   - after each ax load, a pm/pa pair combines it with register 0 and folds
 *     the result into registers 4, 5, 6 and 7 respectively.
 *
 * The last pair is pm(2,0) pa(0,7), i.e. register 0 is the one handed to pa,
 * so the bx operand is not expected to survive this macro.
 *
 * b_ is only passed to pf(), which is empty here, so it has no effect.
 *
 * NOTE(review): pla/pm/pa, S() and KB4 are defined outside this file.  They
 * are presumably "aligned packed load", "packed multiply", "packed add",
 * "sum of two offsets" and "4*KB" -- confirm against camm_util.h.
 */
#define K1_10(a_,b_) \
      pla(a_,ax,1) \
      pla(a_,bx,0) \
      pf(S(a_,b_),ax) \
      pm(0,1) \
      pa(1,4) \
      pla(S(a_,KB4),ax,2) \
      pm(0,2) \
      pa(2,5) \
      pf(S(S(a_,b_),KB4),ax) \
      pla(S(S(a_,KB4),KB4),ax,1) \
      pm(0,1) \
      pa(1,6) \
      pla(S(S(S(a_,KB4),KB4),KB4),ax,2) \
      pm(2,0) \
      pa(0,7) 

/*
 * K1_2_10(a_): tail step for the four-accumulator kernel using pld loads.
 *
 * Register 0 is cleared with px and then loaded from offset a_ off bx.  For
 * each of the four ax operands (a_, then a_ extended by one, two and three
 * KB4 terms) register 1 is cleared, loaded with pld, combined with register 0
 * by pm and folded by pa into registers 4, 5, 6 and 7 in turn.
 *
 * NOTE(review): pld is presumably a half-width (two-float) load, which would
 * explain the px clear before each one -- confirm against camm_util.h.
 */
#define K1_2_10(a_) px(0) \
                    pld(a_,bx,0)  \
                    px(1) \
                    pld(a_,ax,1) \
                    pm(0,1) \
                    pa(1,4) \
                    px(1) \
                    pld(S(a_,KB4),ax,1) \
                    pm(0,1) \
                    pa(1,5) \
                    px(1) \
                    pld(S(S(a_,KB4),KB4),ax,1) \
                    pm(0,1) \
                    pa(1,6) \
                    px(1) \
                    pld(S(S(S(a_,KB4),KB4),KB4),ax,1) \
                    pm(0,1) \
                    pa(1,7) 
                    

/*
 * K1_4_10(a_): tail step for the four-accumulator kernel using the pls /
 * pmsr / pasr primitives.
 *
 * Register 0 is loaded from offset a_ off bx.  Register 1 is then loaded from
 * each of the four ax operands (a_, then a_ extended by one, two and three
 * KB4 terms); each load is followed by a pmsr/pasr pair that folds into
 * registers 4, 5, 6 and 7 in turn.  The last pair is pmsr(1,0) pasr(0,7), so
 * register 0 is the one handed to pasr there.
 *
 * NOTE(review): pls/pmsr/pasr are presumably the single-float (scalar) load,
 * multiply and add -- confirm against camm_util.h.
 */
#define K1_4_10(a_) pls(a_,bx,0)  \
                    pls(a_,ax,1) \
                    pmsr(0,1) \
                    pasr(1,4) \
                    pls(S(a_,KB4),ax,1) \
                    pmsr(0,1) \
                    pasr(1,5) \
                    pls(S(S(a_,KB4),KB4),ax,1) \
                    pmsr(0,1) \
                    pasr(1,6) \
                    pls(S(S(S(a_,KB4),KB4),KB4),ax,1) \
                    pmsr(1,0) \
                    pasr(0,7) 

/* Prologue for variant 10: expands to nothing. */
#define LK10(a_) 
/* Epilogue for variant 10: one ordinary K1_10 step at offset a_. */
#define DK10(a_) K1_10(a_,a_)

/* Variant suffix appended to the K1_/K1_2_/K1_4_/LK/DK names below. */
#define NN 10

/*
 * Variant-independent names used by the unroll chain.  With NN set to 10
 * these resolve to the *_10 / *10 macros above.
 * NOTE(review): Mjoin is defined outside this file; presumably it pastes its
 * two arguments into one identifier -- confirm.
 */
#define K1(a_,b_)      Mjoin(K1_,NN)(a_,b_)
#define K1_2(a_)       Mjoin(K1_2_,NN)(a_)
#define K1_4(a_)       Mjoin(K1_4_,NN)(a_)
#define Load_Kipe(a_)  Mjoin(LK,NN)(a_)
#define Drain_Kipe(a_) Mjoin(DK,NN)(a_)
      


/*
 * Unrolled groups of K1 steps.  K2 issues K1 at offsets a_ and a_ plus 16;
 * K4, K8 and K16 each issue two copies of the next smaller group, spaced 32,
 * 64 and 128 apart, so Kn covers n consecutive 16-wide offsets.
 *
 * The leading pf() in K2 and the second argument of each K1 call only feed
 * the prefetch hook, which is empty in this file.
 */
#define K2(a_) pf(S(a_,80),bx) K1(a_,80) K1(S(a_,16),S(S(64,KB4),KB4))
#define K4(a_) K2(a_) K2(S(a_,32))
#define K8(a_) K4(a_) K4(S(a_,64))
#define K16(a_) K8(a_) K8(S(a_,128))

/*
 * The chain below tests for K16 only once, so it can cover at most
 * 64+32+16+8+4 elements plus the drain step; larger KB is rejected.
 */
#if KB >= 128
#error KB must be less than 128
#endif

/*
 * Compile-time unrolling of the four-accumulator dot product over KB.
 *
 * yN is the instruction stream emitted so far and fN is the offset at which
 * the next piece starts.  Each stage either appends a group of K1 steps and
 * advances the offset, or passes both through unchanged.
 */
#define y0 Load_Kipe(0)
#define f0 0

/*
 * Four elements are held back for the unconditional Drain_Kipe step (y6), so
 * the binary decomposition below is applied to KB - 4.
 */
#define KBB ( KB - 4 )

/* 64 or more remaining: one K16 group, offset advances by 256. */
#if KBB >= 64
#define y1  y0 K16(f0)
#define f1 S(256,f0)
#else
#define y1 y0
#define f1 f0
#endif

/* Bit worth 32 set in KBB: one K8 group, offset advances by 128. */
#if ( KBB / 32 ) % 2
#define y2  y1 K8(f1)
#define f2 S(128,f1)
#else
#define y2 y1
#define f2 f1
#endif

/* Bit worth 16 set in KBB: one K4 group, offset advances by 64. */
#if ( KBB / 16 ) % 2
#define y3  y2 K4(f2)
#define f3 S(64,f2)
#else
#define y3 y2
#define f3 f2
#endif

/*  #define y4 y3 Drain_Kipe(f3) */
/*  #define f4 S(64,f3) */

/* Bit worth 8 set in KBB: one K2 group, offset advances by 32. */
#if ( KBB / 8 ) % 2
#define y4  y3 K2(f3)
#define f4 S(32,f3)
#else
#define y4 y3
#define f4 f3
#endif

/*  #define y5 y4 Drain_Kipe(f4) */
/*  #define f5 f4 */

/* Bit worth 4 set in KBB: a single K1 step, offset advances by 16. */
#if ( KBB / 4 ) % 2
#define y5  y4 K1(f4,f4)
#define f5 S(16,f4)
#else
#define y5 y4
#define f5 f4
#endif

/* Always emitted: the drain step accounts for the 4 elements held back in KBB. */
#define y6 y5 Drain_Kipe(f5)
#define f6 S(16,f5)

/*
 * Tails for KB not divisible by four, tested on KB itself rather than KBB.
 * ATL_USERMM in this file refuses to compile when KB % 4 is non-zero, and
 * for such KB both conditions below are false, so these branches are not
 * taken in a build that passes that check.
 */
#if ( KB / 2 ) % 2
#define y7  y6 K1_2(f6)
#define f7 S(8,f6)
#else
#define y7 y6
#define f7 f6
#endif

#if ( KB / 1 ) % 2 
#define y8 y7 K1_4(f7)
#define f8 S(4,f7)
#else
#define y8 y7
#define f8 f7
#endif

/*
 * Offset step between the four results that W reads and Z writes relative to
 * cx: 4 when SREAL is defined, 8 otherwise.
 * NOTE(review): the 8 presumably corresponds to interleaved complex storage
 * where only one component is touched -- confirm against the build flags.
 */
#ifdef SREAL
#define CS 4
#else
#define CS 8
#endif

/* Literal prefetcht0 on the address in ecx; used at the start of the SREAL Z. */
#define FF "prefetcht0 (%%ecx)\n\t"
/*
 * Z: write-back of the four accumulators (registers 4..7) to cx.
 *
 * SREAL: a sequence of pc/pul/puh/ps/pa steps over registers 4..7, using
 * registers 0..2 as scratch, followed by a single pu store of register 2 at
 * offset 0 from cx.
 * NOTE(review): this looks like a transpose-and-add that reduces the four
 * accumulators to one vector of four sums stored with one unaligned store --
 * confirm against camm_util.h.
 *
 * Otherwise: Z1(a_,b_) is applied to each accumulator (a_) with a scratch
 * register (b_), and each result is stored with pus at cx offsets 0, CS,
 * CS+CS and CS+CS+CS.
 * NOTE(review): Z1 is presumably a horizontal sum of a_ into its low
 * element -- confirm.
 */
#ifdef SREAL
#define Z FF pc(4,0) pul(5,4) pc(6,1) puh(5,0) pul(7,6)  \
          pa(0,4) puh(7,1) pc(4,2) pa(1,6) ps(68,6,4) ps(238,6,2) pa(4,2) pu(2,0,cx)
#else
#define Z1(a_,b_) phl(a_,b_) pa(b_,a_) pc(a_,b_) ps(1,b_,b_) pasr(b_,a_)
#define Z    Z1(4,0) pus(4,0,cx) Z1(5,1) pus(5,CS,cx) \
             Z1(6,2) pus(6,S(CS,CS),cx) Z1(7,0) pus(7,S(S(CS,CS),CS),cx)
#endif

/*
 * W: initialisation of the four accumulators (registers 4..7) before the
 * unrolled product, selected by the BETA0 / BETA1 / BETAX build flag.  If
 * none of the three is defined, W is left undefined.
 *
 *   BETA0: each accumulator is cleared with px.
 *   BETA1: each accumulator is loaded with pls from cx at offsets 0, CS,
 *          CS+CS and CS+CS+CS.
 *   BETAX: as BETA1, and each loaded value is then combined with register 3
 *          via pmsr (ATL_USERMM loads register 3 through the beta pointer
 *          when BETAX is defined).
 */
#ifdef BETA0
#define W    px(4) px(5) px(6) px(7)
#endif
#ifdef BETA1
#define W    pls(0,cx,4) pls(CS,cx,5) pls(S(CS,CS),cx,6) pls(S(S(CS,CS),CS),cx,7)
#endif
#ifdef BETAX
#define W    pls(0,cx,4) pmsr(3,4) pls(CS,cx,5) pmsr(3,5) pls(S(CS,CS),cx,6) pmsr(3,6) \
             pls(S(S(CS,CS),CS),cx,7) pmsr(3,7)
#endif


/*
 * Single-accumulator counterparts of the K1 macros, variant 10.
 *
 * p1_10(a_): load from offset a_ off ax into register 1 and off bx into
 * register 0, then pm(0,1) pa(1,4) folds the result into register 4.
 */
#define p1_10(a_) \
      pla(a_,ax,1) \
      pla(a_,bx,0) \
      pm(0,1) \
      pa(1,4) 

/* Prologue for variant 10: expands to nothing. */
#define lp10(a_) 
/* Epilogue for variant 10: one ordinary p1_10 step. */
#define dp10(a_) p1_10(a_)
/*
 * Tail step using pld loads: both registers are cleared with px before being
 * loaded, then pm(1,0) pa(0,4) folds the result into register 4.
 */
#define p1_2_10(a_) px(0) \
                    pld(a_,bx,0)  \
                    px(1) \
                    pld(a_,ax,1) \
                    pm(1,0) \
                    pa(0,4) 

/* Tail step using the pls/pmsr/pasr primitives, folding into register 4. */
#define p1_4_10(a_) pls(a_,bx,0)  \
                    pls(a_,ax,1) \
                    pmsr(1,0) \
                    pasr(0,4) 

/*
 * Variant 11 of the single-accumulator macros.  It uses pl instead of pla
 * for the full-width loads.  N is defined as 10 in this file, so these are
 * not selected by the dispatch macros.
 */
#define p1_11(a_) \
      pl(a_,ax,1) \
      pl(a_,bx,0) \
      pm(0,1) \
      pa(1,4) 

/*
 * Two steps with loads issued ahead of use: registers 3/2 are loaded at a_
 * while the pm/pa pair consumes registers 0/1 from an earlier load, then
 * registers 1/0 are reloaded at a_ plus 16 while 2/3 are consumed.  It
 * therefore expects registers 0/1 to be preloaded (see lp11) and leaves a
 * pending pair for dp11.  No dispatch macro in this file refers to p2_11.
 */
#define p2_11(a_) \
      pl(a_,ax,3) \
      pl(a_,bx,2) \
      pm(0,1) \
      pa(1,4) \
      pl(S(a_,16),ax,1) \
      pl(S(a_,16),bx,0) \
      pm(2,3) \
      pa(3,4) 

/* Prologue: preload registers 1 and 0.  Epilogue: consume the pending pair. */
#define lp11(a_) pl(a_,ax,1) pl(a_,bx,0)
#define dp11(a_) pm(0,1) pa(1,4)
/* Tail step using pld loads; same sequence as p1_2_10. */
#define p1_2_11(a_) px(0) \
                    pld(a_,bx,0)  \
                    px(1) \
                    pld(a_,ax,1) \
                    pm(1,0) \
                    pa(0,4) 

/* Tail step using pls/pmsr/pasr; same sequence as p1_4_10. */
#define p1_4_11(a_) pls(a_,bx,0)  \
                    pls(a_,ax,1) \
                    pmsr(1,0) \
                    pasr(0,4) 


/*
 * Variant 12 of the single-accumulator macros.  N is defined as 10 in this
 * file, so these are not selected by the dispatch macros.
 */
#define p1_12(a_) \
      pl(a_,ax,1) \
      pl(a_,bx,0) \
      pm(0,1) \
      pa(1,4) 

/*
 * Four steps with loads issued ahead of use.  bx operands rotate through
 * registers 0..3 and ax operands alternate between registers 5 and 6; every
 * pm/pa pair folds into register 4.  Each pair consumes registers loaded
 * earlier (the first one uses registers 6 and 2, which lp12 preloads), and
 * the macro ends with loads at a_ plus 64 / 80 that are left for the next
 * step.  The p4 dispatch line for this is commented out further down.
 */
#define p4_12(a_) \
      pl(S(a_,32),bx,0) \
      pl(S(a_,16),ax,5) \
      pm(6,2) \
      pa(2,4) \
      pl(S(a_,48),bx,1) \
      pl(S(a_,32),ax,6) \
      pm(5,3) \
      pa(3,4) \
      pl(S(a_,64),bx,2) \
      pl(S(a_,48),ax,5) \
      pm(6,0) \
      pa(0,4) \
      pl(S(a_,80),bx,3) \
      pl(S(a_,64),ax,6) \
      pm(5,1) \
      pa(1,4) 

/* Prologue: preload registers 2 and 3 from bx and register 6 from ax. */
#define lp12(a_) pl(a_,bx,2) pl(a_,ax,6) pl(S(a_,16),bx,3)
/*
 * Epilogue: the same sequence as p4_12 with the three look-ahead loads
 * (bx at a_ plus 64 and 80, ax at a_ plus 64) left out.
 */
#define dp12(a_) \
      pl(S(a_,32),bx,0) \
      pl(S(a_,16),ax,5) \
      pm(6,2) \
      pa(2,4) \
      pl(S(a_,48),bx,1) \
      pl(S(a_,32),ax,6) \
      pm(5,3) \
      pa(3,4) \
      pl(S(a_,48),ax,5) \
      pm(6,0) \
      pa(0,4) \
      pm(5,1) \
      pa(1,4) 

/* Tail step using pld loads; same sequence as p1_2_10. */
#define p1_2_12(a_) px(0) \
                    pld(a_,bx,0)  \
                    px(1) \
                    pld(a_,ax,1) \
                    pm(1,0) \
                    pa(0,4) 

/* Tail step using pls/pmsr/pasr; same sequence as p1_4_10. */
#define p1_4_12(a_) pls(a_,bx,0)  \
                    pls(a_,ax,1) \
                    pmsr(1,0) \
                    pasr(0,4) 





/* Variant suffix for the single-accumulator macros; 10 selects the *_10 family. */
#define N 10

/*
 * Variant-independent names used by the x0..x8 unroll chain.
 * NOTE(review): Mjoin is defined outside this file; presumably it pastes its
 * two arguments into one identifier -- confirm.
 */
#define p1_4(a_) Mjoin(p1_4_,N)(a_)
#define p1_2(a_) Mjoin(p1_2_,N)(a_)
#define p1(a_)   Mjoin(p1_,N)(a_)
/*  #define p4(a_)   Mjoin(p4_,N)(a_) */
#define load_pipe(a_) Mjoin(lp,N)(a_)
#define drain_pipe(a_) Mjoin(dp,N)(a_)
      


/*
 * Unrolled groups of p1 steps: p2 issues p1 at a_ and a_ plus 16, and p4,
 * p8 and p16 each issue two copies of the next smaller group, spaced 32, 64
 * and 128 apart.  The pf() calls expand to nothing in this file.
 */
#define p2(a_) pf(S(a_,32),bx) p1(a_) pf(S(a_,32),ax) p1(S(a_,16))
#define p4(a_) p2(a_) p2(S(a_,32))
#define p8(a_) p4(a_) p4(S(a_,64))
#define p16(a_) p8(a_) p8(S(a_,128))

/* Same limit as for the y chain: p16 is tested for only once below. */
#if KB >= 128
#error KB must be less than 128
#endif

/*
 * Compile-time unrolling of the single-accumulator dot product over KB.
 * xN is the instruction stream emitted so far and oN is the offset at which
 * the next piece starts.
 */
#define x0 load_pipe(0)
#define o0 0

/*
 * Unlike the y chain there is no drain stage here (it is commented out
 * below), so the decomposition is applied to the whole of KB.
 */
#ifdef KBB
   #undef KBB
#endif
#define KBB ( KB - 0 )

/* 64 or more: one p16 group, offset advances by 256. */
#if KBB >= 64
#define x1  x0 p16(o0)
#define o1 S(256,o0)
#else
#define x1 x0
#define o1 o0
#endif

/* Bit worth 32 set: one p8 group, offset advances by 128. */
#if ( KBB / 32 ) % 2
#define x2  x1 p8(o1)
#define o2 S(128,o1)
#else
#define x2 x1
#define o2 o1
#endif

/* Bit worth 16 set: one p4 group, offset advances by 64. */
#if ( KBB / 16 ) % 2
#define x3  x2 p4(o2)
#define o3 S(64,o2)
#else
#define x3 x2
#define o3 o2
#endif

/*  #define x4 x3 drain_pipe(o3) */
/*  #define o4 S(64,o3) */
/* Pass-through stage left in place of the disabled drain step. */
#define x4 x3
#define o4 o3

/* Bit worth 8 set: one p2 group, offset advances by 32. */
#if ( KBB / 8 ) % 2
#define x5  x4 p2(o4)
#define o5 S(32,o4)
#else
#define x5 x4
#define o5 o4
#endif

/*  #define x5 x4 drain_pipe(o4) */
/*  #define o5 o4 */

/* Bit worth 4 set: a single p1 step, offset advances by 16. */
#if ( KBB / 4 ) % 2
#define x6  x5 p1(o5)
#define o6 S(16,o5)
#else
#define x6 x5
#define o6 o5
#endif

/*
 * Tails for KB not divisible by four.  ATL_USERMM in this file refuses to
 * compile when KB % 4 is non-zero, and for such KB both conditions below are
 * false, so these branches are not taken in a build that passes that check.
 */
#if ( KB / 2 ) % 2
#define x7  x6 p1_2(o6)
#define o7 S(8,o6)
#else
#define x7 x6
#define o7 o6
#endif

#if ( KB / 1 ) % 2 
#define x8 x7 p1_4(o7)
#define o8 S(4,o7)
#else
#define x8 x7
#define o8 o7
#endif

/*
 * z: write-back for the single-accumulator path.  z1 has the same body as
 * the Z1 macro of the four-accumulator path; z applies it to register 4 with
 * register 0 as scratch and stores the result with pus at offset 0 from cx.
 * NOTE(review): z1 is presumably a horizontal sum of a_ into its low
 * element -- confirm against camm_util.h.
 */
#define z1(a_,b_) phl(a_,b_) pa(b_,a_) pc(a_,b_) ps(1,b_,b_) pasr(b_,a_)
#define z    z1(4,0) pus(4,0,cx) 

/*
 * CINC: amount ATL_USERMM adds to cx per result (it has the same values as
 * CS).  LDCM: factor applied to (ldc-m) when ATL_USERMM computes the
 * end-of-column adjustment for cx.
 */
#ifdef SREAL
#define CINC 4
#define LDCM 1
#else
#define CINC 8
#define LDCM 2
#endif

/*
 * w: initialisation of register 4 for the single-accumulator path, mirroring
 * W: cleared (BETA0), loaded from cx (BETA1), or loaded from cx and combined
 * with register 3 via pmsr (BETAX).  Undefined if none of the three is set.
 */
#ifdef BETA0
#define w    px(4)
#endif
#ifdef BETA1
#define w    pls(0,cx,4) 
#endif
#ifdef BETAX
#define w    pls(0,cx,4) pmsr(3,4) 
#endif


/*
 * Matrix-multiply kernel written as a single inline-assembly statement.
 *
 * Parameters referenced by the body:
 *   m, n  - used together with the compile-time KB to compute the end
 *           pointers a+m*KB and b+n*KB
 *   a, b  - input operands; each step of the inner loops advances them by
 *           KB4 (or a multiple of it)
 *   beta  - read through a pointer only when BETAX is defined
 *   c     - output; advanced by CINC per result
 *   ldc   - used to form the per-outer-iteration adjustment
 *           (ldc-m)*LDCM*sizeof(*c) that is added to the c pointer
 * k, alpha, lda and ldb are not referenced in this function.
 *
 * Structure: the outer loop (loopb) walks b until b+n*KB.  For each b
 * position, loopa handles a in groups of four rows with the W / y8 / Z
 * macros up to a+((m>>2)<<2)*KB, i.e. m rounded down to a multiple of four,
 * and the "lit" loop handles the remaining rows one at a time with
 * w / x8 / z.
 *
 * NOTE(review): the asm overwrites eax, ecx and esi, which are declared as
 * input-only operands, and no xmm register appears in the clobber list.
 * The esp adjustments around the "m" operand loads also suggest the code
 * assumes those operands are addressed relative to esp.  Confirm these
 * assumptions hold for the compilers and flags this file is built with.
 */
void
ATL_USERMM (int m, int n, int k, float alpha, const float *a,
	    int lda,const float *b, int ldb, float beta, float *c,
	    int ldc) {

  /* Only passed to the asm (in edi) when BETAX is defined. */
  float *bbp=&beta;

  ASM ( 

#if KB % 4
#error KB must be divisible by four -- m n cleanup needs alignment
#endif

       /* BETAX: load through edi (the beta pointer) into register 3 before edi is reused. */
#ifdef BETAX
       pls(0,di,3)
#endif

       /* Save ebx and keep the current b pointer (esi on entry) in it. */
       "pushl %%ebx\n\t"
       "movl  %%esi,%%ebx\n\t"

       /* presumably undoes the push so esp-relative operands resolve correctly -- confirm */
       a(4,sp)

       /* esi = a+m*KB (end of a), edi = b+n*KB (end of b). */
       "movl %4,%%esi\n\t"
       "movl %5,%%edi\n\t"

       a(-4,sp)

       /* Save ebp and keep the end-of-b pointer in it. */
       "pushl %%ebp\n\t"
       "movl %%edi,%%ebp\n\t"

       /* Two pushes are outstanding now, hence the adjustment by 8. */
       a(8,sp)
       /* edi = a+((m>>2)<<2)*KB, the end of the four-row part of a. */
       "movl %6,%%edi\n\t"
       a(-8,sp)

       /* Outer loop: one pass per KB4 step of b. */
       lab(loopb)
       
       /* Remember the start of a; it is restored at cstop. */
       "pushl %%eax\n\t"

       /* Four rows of a at a time, until eax reaches edi. */
       lab(loopa)
       
       cmp(ax,di)
       je(lit)
       
       /* Initialise registers 4..7, run the unrolled product, write back. */
       W
       y8
       Z
       
       /* Advance a by KB8+KB8 and c by four CINC steps. */
       a(S(KB8,KB8),ax)
       a(S(S(CINC,CINC),S(CINC,CINC)),cx)
       jmp(loopa)

       /* Remaining rows of a one at a time, until eax reaches esi. */
       lab(lit)
       cmp(ax,si)
       je(cstop)

       /* Initialise register 4, run the unrolled product, write back. */
       w
       x8
       z

       a(KB4,ax)
       a(CINC,cx)
       
       jmp(lit)
       lab(cstop)

       /* Rewind a, apply the edx adjustment to c, step b by KB4. */
       "popl %%eax\n\t"
       ra(dx,cx)
       a(KB4,bx)
       
       /* Repeat until the b pointer reaches the end-of-b pointer in ebp. */
       cmp(bx,bp)
       jne(loopb)
       
       "popl %%ebp\n\t"
       "popl %%ebx\n\t"


       /* Operands: %0 a (eax), %1 b (esi), %2 c (ecx), %3 c adjustment (edx),
          %4..%6 the three end pointers in memory, and with BETAX %7 bbp (edi). */
       ::"a" (a),"S" (b),"c" (c),"d" ((ldc-m)*LDCM*sizeof(*c)),
       "m" (a+m*KB),"m" (b+n*KB),"m" (a+((m>>2)<<2)*KB)
#ifdef BETAX
       ,"D" (bbp):"memory");
#else
       :"di","memory");
#endif
  
}
