![]() |
Blitz Support : |
From: Xianglong Yuan (yuanx_at_[hidden])
Date: 2004-08-21 08:30:59
Thanks, Mathias and Xavier.
I've tried cblas_dgemm() function with Intel MKL-7.0 library, as
Mathias suggested, and with ATLAS-3.7.8 library, as Xavier suggested.
I didn't tried GSL since Mathias mentioned it was slower than MKL.
Furthermore, GSL has its own interface and matrix structure. The
results with cblas_dgemm() are:
with ATLAS library:
> straight plain array: 140000
> cblas with plain array: 200000
> cblas with blitz array: 200000
(icc -O3 -tpp7 -xN -cblas -atlas)
with Intel MKL library:
> straight plain array: 140000
> cblas with plain array: 240000
> cblas with blitz array: 240000
(icc -O3 -tpp7 -xN -lmkl)
Note 1: ATLAS libraries are static links, whereas MKL library is
dynamic link to libmkl.so. Static link -lmkl_ia32 failed.
Note 2: I cannot use -fast option, which is equivalent to "-O3 -ipo
-static". Either -static or -ipo option gives error message.
I again attach my 'cblas_test.cpp' below. I don't think it worths to
go with either ATLAS or Intel MKL libraries. Your comments and/or
suggestions on how to improve the performance will be appreciated.
Thanks all for your helps.
Xianglong
==============================
Xavier: Xeon has a much larger cache, AFAIK. That explains why your
results have a 3-time difference.
Toby: Thanks. Now I know why your Matrix declaration does not have
<double> after. I thought the Matrix is the Matrix<double> in Blitz.
TIA: You are right. It is a well-defined piece of linear algebra. I
always write matrix with columns first. I guess I remember now it is
the ROWS comes first. :-))
Xianglong
======================
// cblas_test.cpp
#include <ctime>
#include <cassert>
#include <iostream>
#include "blitz/array.h"
#include "mkl_cblas.h"
//#include "cppblas.h"
// cppblas.h is a cpp wrapper of cblas.h:
// extern "C" {
// #include "cblas.h"
// }
using namespace std;
int main()
{
static const int ROWS_A = 2000000;
static const int COLS_A = 3;
static const int COLS_B = 3;
static const int ROWS_B = COLS_A;
clock_t t_start, t_stop;
double* orig_A = new double[ROWS_A*COLS_A];
double* orig_B = new double[ROWS_B*COLS_B];
for (int i = 0; i < ROWS_A*COLS_A; i++)
orig_A[i] = random()/(RAND_MAX+1.0);
for (int i = 0; i < ROWS_B*COLS_B; i++)
orig_B[i] = 20.0 * random()/(RAND_MAX+1.0);
//* start plain array
double* A1 = new double[ROWS_A*COLS_A];
double* B1 = new double[ROWS_B*COLS_B];
for (int i = 0; i < ROWS_A; i++)
for (int j = 0; j < COLS_A; j++)
A1[i*COLS_A+j] = orig_A[i*COLS_A+j];
for (int i = 0; i < ROWS_B; i++)
for (int j = 0; j < COLS_B; j++)
B1[i*COLS_B+j] = orig_B[i*COLS_B+j];
double* C1 = new double[ROWS_A*COLS_B];
assert(COLS_A == ROWS_B);
t_start = clock();
for (int i = 0; i < ROWS_A; i++)
for (int j = 0; j < COLS_B; j++) {
C1[i*COLS_B+j] = 0.0;
for (int k = 0; k < COLS_A; k++)
C1[i*COLS_B+j] += A1[i*COLS_A+k] * B1[k*COLS_B+j];
}
t_stop = clock();
cout << "straight plain array: " << t_stop - t_start << endl;
// t_start = clock();
//
// cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
// ROWS_A, COLS_B, COLS_A, 1.0,
// A1, COLS_A, B1, COLS_B, 0.0, C1, COLS_B);
//
// t_stop = clock();
// cout << "cblas with plain array: " << t_stop - t_start << endl;
delete[] A1, B1, C1;
// end plain array */
//* start blitz array
blitz::Range rA(0, ROWS_A-1), cA(0, COLS_A-1);
blitz::Range rB(0, ROWS_B-1), cB(0, COLS_B-1);
blitz::Array<double, 2> A2(rA, cA), B2(rB, cB);
for (int i = 0; i < ROWS_A; i++)
for (int j = 0; j < COLS_A; j++)
A2(i,j) = orig_A[i*3+j];
for (int i = 0; i < ROWS_B; i++)
for (int j = 0; j < COLS_B; j++)
B2(i,j) = orig_B[i*3+j];
blitz::Array<double, 2> C2(rA, cB);
assert(A2.columns() == B2.rows());
t_start = clock();
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
C2.rows(), C2.columns(), A2.columns(), 1.0,
A2.data(), A2.columns(),
B2.data(), B2.columns(), 0.0,
C2.data(), C2.columns());
t_stop = clock();
cout << "cblas with blitz array: " << t_stop - t_start << endl;
// end blitz array */
return 0;
}