/*
 * Accumulates one MB x NB block of a matrix product into C:
 *
 *    C[i + j*ldc] += sum over k of A[k + i*KB] * B[k + j*KB]
 *
 * for 0 <= i < MB, 0 <= j < NB, 0 <= k < KB.
 *
 * MB, NB and KB are not declared here; they are expected to be defined
 * outside this function (presumably build-supplied macros).  Both A and B
 * are read with a fixed leading dimension of KB; only C uses a run-time
 * leading dimension (ldc).
 *
 * The k loop steps by 2 and the i loop by 8, both terminating on "!=",
 * so KB must be a multiple of 2 and MB a multiple of 8.
 *
 * M, N, K, alpha, lda0, ldb0 and beta are accepted but never read.
 * NOTE(review): because beta is not applied, the result is a plain
 * "C += ..." update, i.e. it only matches a general GEMM update when
 * alpha == 1 and beta == 1 -- confirm against the caller / build flags.
 */
void ATL_USERMM
   (const int M, const int N, const int K, const double alpha, const double *A, const int lda0, const double *B, const int ldb0, const double beta, double *C, const int ldc)
{
   register double b0, b1;
   const double *pA;
   double *pC=C;          /* start of the C column being updated */
   register int i, j, k;
   #define lda KB
   #define ldb KB

   for (j=0; j != NB; j++)
   {
      pA = A;             /* restart at k = 0 of A for every column */
      for (k=0; k != KB; k += 2)
      {
         /* two consecutive entries of the current B column */
         b0 = B[k];
         b1 = B[k+1];
         /* rank-2 update of column j of C, eight rows per pass */
         for (i=0; i != MB; i += 8)
         {
            pC[i] += pA[i*lda] * b0 + pA[i*lda+1] * b1;
            pC[i+1] += pA[(i+1)*lda] * b0 + pA[(i+1)*lda+1] * b1;
            pC[i+2] += pA[(i+2)*lda] * b0 + pA[(i+2)*lda+1] * b1;
            pC[i+3] += pA[(i+3)*lda] * b0 + pA[(i+3)*lda+1] * b1;
            pC[i+4] += pA[(i+4)*lda] * b0 + pA[(i+4)*lda+1] * b1;
            pC[i+5] += pA[(i+5)*lda] * b0 + pA[(i+5)*lda+1] * b1;
            pC[i+6] += pA[(i+6)*lda] * b0 + pA[(i+6)*lda+1] * b1;
            pC[i+7] += pA[(i+7)*lda] * b0 + pA[(i+7)*lda+1] * b1;
         }
         pA += 2;         /* next pair of k entries; the C column stays put */
      }
      /* Move to the next column of C and of B together.  The C pointer
         must advance with j (not with k), otherwise every column of B
         would be accumulated into the same columns of C. */
      pC += ldc;
      B += ldb;
   }
}
