Differences between revisions 1 and 2

Deletions are marked like this. Additions are marked like this.
Line 13: Line 13:

/* MY_MMult_inner prototype */
void MY_MMult_inner( int m, int n, int k, double *a, int lda,
                                    double *b, int ldb,
                                    double *c, int ldc );
Line 62: Line 67:
 aip = *ap++;         aip = *ap++;
Line 69: Line 74:
 aip = *ap++;         aip = *ap++;
Line 76: Line 81:
 aip = *ap++;         aip = *ap++;
Line 83: Line 88:
 aip = *ap++;         aip = *ap++;

/* Create macros so that the matrices are stored in column-major order */

#define A(i,j) a[ (j)*lda + (i) ]
#define AT(i,j) at[ (j)*(k) + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

#define min( x, y ) ( x < y ? x : y )

#define MB 144
#define KB 144

/* MY_MMult_inner prototype */
void MY_MMult_inner( int m, int n, int k, double *a, int lda, 
                                    double *b, int ldb,
                                    double *c, int ldc );


/* Routine for computing C = A * B + C */

void MY_MMult( int m, int n, int k, double *a, int lda, 
                                    double *b, int ldb,
                                    double *c, int ldc )
{
  int i, p, ib, pb;

  for ( p=0; p<k; p+=KB ){
    pb = min( k-p, KB );
    for ( i=0; i<m; i+=MB ){
      ib = min( m-i, MB );

      MY_MMult_inner( ib, n, pb, 
                      &A( i, p ), lda, 
                      &B( p, 0 ), ldb, 
                      &C( i, 0 ), ldc );
    }
  }
}


  


/* Inner kernel for computing C = A * B + C */

void MY_MMult_inner( int m, int n, int k, double *a, int lda, 
                                          double *b, int ldb,
                                          double *c, int ldc )
{
  int i, j, p;
  double at[ MB * KB ];

  transpose_matrix( m, k, a, lda, at, k );

  for ( j=0; j<n; j+=4 ){
    double *c0p = &C(0,j), *c1p = &C(0,j+1), *c2p = &C(0,j+2), *c3p = &C(0,j+3);

    for ( i=0; i<m; i++ ){
      register double c0=0.0, c1=0.0, c2=0.0, c3=0.0;
      double *ap = &AT(0,i), *b0p=&B(0,j  ), *b1p=&B(0,j+1), 
                            *b2p=&B(0,j+2), *b3p=&B(0,j+3);

      for ( p=0; p<k; p+=4 ){
        register double aip;

        aip = *ap++;

        c0 = c0 + aip * b0p[0];
        c1 = c1 + aip * b1p[0];
        c2 = c2 + aip * b2p[0];
        c3 = c3 + aip * b3p[0];

        aip = *ap++;

        c0 = c0 + aip * b0p[1];
        c1 = c1 + aip * b1p[1];
        c2 = c2 + aip * b2p[1];
        c3 = c3 + aip * b3p[1];

        aip = *ap++;

        c0 = c0 + aip * b0p[2];
        c1 = c1 + aip * b1p[2];
        c2 = c2 + aip * b2p[2];
        c3 = c3 + aip * b3p[2];

        aip = *ap++;

        c0 = c0 + aip * b0p[3];
        c1 = c1 + aip * b1p[3];
        c2 = c2 + aip * b2p[3];
        c3 = c3 + aip * b3p[3];

        b0p+=4;
        b1p+=4;
        b2p+=4;
        b3p+=4;
      }
      
      *c0p++ += c0;
      *c1p++ += c1;
      *c2p++ += c2;
      *c3p++ += c3;
    }
  }
}

LinearAlgebraWiki: OptimizingGemm/Details/MMult16 (last edited 2009-11-02 13:42:39 by RobertVanDeGeijn)