Blitz logo

Blitz Support :

From: Xianglong Yuan (yuanx_at_[hidden])
Date: 2004-08-21 08:30:59


Thanks, Mathias and Xavier.

I've tried cblas_dgemm() function with Intel MKL-7.0 library, as
Mathias suggested, and with ATLAS-3.7.8 library, as Xavier suggested.
I didn't tried GSL since Mathias mentioned it was slower than MKL.
Furthermore, GSL has its own interface and matrix structure. The
results with cblas_dgemm() are:

with ATLAS library:

> straight plain array: 140000
> cblas with plain array: 200000
> cblas with blitz array: 200000

(icc -O3 -tpp7 -xN -cblas -atlas)

with Intel MKL library:

> straight plain array: 140000
> cblas with plain array: 240000
> cblas with blitz array: 240000

(icc -O3 -tpp7 -xN -lmkl)

Note 1: ATLAS libraries are static links, whereas MKL library is
dynamic link to libmkl.so. Static link -lmkl_ia32 failed.

Note 2: I cannot use -fast option, which is equivalent to "-O3 -ipo
-static". Either -static or -ipo option gives error message.

I again attach my 'cblas_test.cpp' below. I don't think it worths to
go with either ATLAS or Intel MKL libraries. Your comments and/or
suggestions on how to improve the performance will be appreciated.

Thanks all for your helps.

Xianglong

==============================

Xavier: Xeon has a much larger cache, AFAIK. That explains why your
results have a 3-time difference.

Toby: Thanks. Now I know why your Matrix declaration does not have
<double> after. I thought the Matrix is the Matrix<double> in Blitz.

TIA: You are right. It is a well-defined piece of linear algebra. I
always write matrix with columns first. I guess I remember now it is
the ROWS comes first. :-))

Xianglong

======================
// cblas_test.cpp

#include <ctime>
#include <cassert>
#include <iostream>
#include "blitz/array.h"
#include "mkl_cblas.h"
//#include "cppblas.h"
// cppblas.h is a cpp wrapper of cblas.h:
// extern "C" {
// #include "cblas.h"
// }
using namespace std;

int main()
{
  static const int ROWS_A = 2000000;
  static const int COLS_A = 3;
  static const int COLS_B = 3;
  static const int ROWS_B = COLS_A;
  clock_t t_start, t_stop;

  double* orig_A = new double[ROWS_A*COLS_A];
  double* orig_B = new double[ROWS_B*COLS_B];

  for (int i = 0; i < ROWS_A*COLS_A; i++)
    orig_A[i] = random()/(RAND_MAX+1.0);
  for (int i = 0; i < ROWS_B*COLS_B; i++)
    orig_B[i] = 20.0 * random()/(RAND_MAX+1.0);

//* start plain array

  double* A1 = new double[ROWS_A*COLS_A];
  double* B1 = new double[ROWS_B*COLS_B];

  for (int i = 0; i < ROWS_A; i++)
    for (int j = 0; j < COLS_A; j++)
      A1[i*COLS_A+j] = orig_A[i*COLS_A+j];

  for (int i = 0; i < ROWS_B; i++)
    for (int j = 0; j < COLS_B; j++)
      B1[i*COLS_B+j] = orig_B[i*COLS_B+j];

  double* C1 = new double[ROWS_A*COLS_B];
  assert(COLS_A == ROWS_B);

  t_start = clock();

  for (int i = 0; i < ROWS_A; i++)
    for (int j = 0; j < COLS_B; j++) {
      C1[i*COLS_B+j] = 0.0;
      for (int k = 0; k < COLS_A; k++)
        C1[i*COLS_B+j] += A1[i*COLS_A+k] * B1[k*COLS_B+j];
    }

  t_stop = clock();
  cout << "straight plain array: " << t_stop - t_start << endl;

// t_start = clock();
//
// cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
// ROWS_A, COLS_B, COLS_A, 1.0,
// A1, COLS_A, B1, COLS_B, 0.0, C1, COLS_B);
//
// t_stop = clock();
// cout << "cblas with plain array: " << t_stop - t_start << endl;
  
  delete[] A1, B1, C1;

// end plain array */

//* start blitz array

  blitz::Range rA(0, ROWS_A-1), cA(0, COLS_A-1);
  blitz::Range rB(0, ROWS_B-1), cB(0, COLS_B-1);
  blitz::Array<double, 2> A2(rA, cA), B2(rB, cB);

  for (int i = 0; i < ROWS_A; i++)
    for (int j = 0; j < COLS_A; j++)
      A2(i,j) = orig_A[i*3+j];

  for (int i = 0; i < ROWS_B; i++)
    for (int j = 0; j < COLS_B; j++)
      B2(i,j) = orig_B[i*3+j];

  blitz::Array<double, 2> C2(rA, cB);
  assert(A2.columns() == B2.rows());

  t_start = clock();

  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
      C2.rows(), C2.columns(), A2.columns(), 1.0,
      A2.data(), A2.columns(),
      B2.data(), B2.columns(), 0.0,
      C2.data(), C2.columns());

  t_stop = clock();
  cout << "cblas with blitz array: " << t_stop - t_start << endl;

// end blitz array */

  return 0;
}