Blitz logo

Blitz Support :

From: Marc Vaillant (marc_at_[hidden])
Date: 2003-06-30 19:43:34


Hi,

I´ve recently started using blitz, which I really love. However, I'm
taking performance hits for the level of abstraction I would like (not
necessarily having anything to do with blitz). Below is an example of
two functions which perform the exact same computation -- the first
uses the beautiful tensor notation, and the second uses only loops.
Both implementations are too slow; on my Xeon the tensor routine takes
27 sec, and the loop routine 8 sec for numLmks=70 and dim=3. I
realize that they are naive implementations in terms of performance so
I'm not blaming blitz. I'm just wondering if anyone has suggestions
for speed up without sacrificing clarity of the code. If I have to
pack everything into lower dimensional arrays and use blas products I
guess that's what I'll have to do...

Thanks,
Marc

typedef Array<double,1> array1D ;
typedef Array<double,2> array2D ;
typedef Array<double,3> array3D ;
typedef Array<double,4> array4D ;
typedef Array<double,5> array5D ;
typedef Array<double,6> array6D ;
typedef Array<double,7> array7D ;

void computeDadadt(
                array4D& dadadt,
                const array6D& ddSdxx,
                const array6D& ddSdyx,
                const array5D& dSdx,
                const array4D& dqda,
                const array4D& dada,
                const array4D& S,
                const array2D& q,
                const array2D& a)
{
  int numLmks=q.rows();
  int dim=q.columns();
  array7D temp7(numLmks,dim,numLmks,numLmks,dim,dim,dim);
  array6D temp6(numLmks,dim,numLmks,numLmks,dim,dim);
  array5D temp5(numLmks,dim,numLmks,numLmks,dim);
  using namespace blitz::tensor;
  
  //first 2 terms
  temp7=sum((ddSdxx(k,l,m,n,o,p)*dqda(i,j,k,m)+ddSdyx(k,l,m,n,o,p)*dqda(i,j,l,m))
                  *a(l,p)*a(k,o),p);
  //add in 3rd and 4th terms then do reduction
  temp6=sum(temp7(i,j,k,l,o,m,n)
          +dSdx(k,l,m,n,o)*(dada(i,j,l,o)*a(k,n)+a(l,o)*dada(i,j,k,n)),o);
  temp5=sum(temp6(i,j,k,l,m,n),n);
  dadadt=-sum(temp5(i,j,k,m,l),m);
}

void computeDadadt_loops(
                array4D& dadadt,
                const array6D& ddSdxx,
                const array6D& ddSdyx,
                const array5D& dSdx,
                const array4D& dqda,
                const array4D& dada,
                const array4D& S,
                const array2D& q,
                const array2D& a)
{
  int i,j,k,l,m,n,o,p;
  int numLmks=q.rows();
  int dim=q.columns();
  array7D temp7(numLmks,dim,numLmks,numLmks,dim,dim,dim);
  array6D temp6(numLmks,dim,numLmks,numLmks,dim,dim);
  array5D temp5(numLmks,dim,numLmks,numLmks,dim);

  temp7=0.;
  temp6=0.;
  temp5=0.;
  dadadt=0.;

  for(i=0;i<numLmks;i++)
    for(j=0;j<dim;j++)
      for(k=0;k<numLmks;k++)
        for(l=0;l<numLmks;l++)
        {
          for(m=0;m<dim;m++)
          {
            for(n=0;n<dim;n++)
              for(o=0;o<dim;o++)
              {
                for(p=0;p<dim;p++)
                  temp7(i,j,k,l,m,n,o)+=(ddSdxx(k,l,m,n,o,p)*dqda(i,j,k,m)+ddSdyx(k,l,m,n,o,p)*dqda(i,j,l,m))*a(l,p)*a(k,o);
                temp6(i,j,k,l,m,n)+=dSdx(k,l,m,n,o)*(dada(i,j,l,o)*a(k,n)+a(l,o)*dada(i,j,k,n));
              }
            for(n=0;n<dim;n++)
              for(o=0;o<dim;o++)
                temp6(i,j,k,l,n,o)+=temp7(i,j,k,l,m,n,o);
          }
          for(m=0;m<dim;m++)
          {
            for(n=0;n<dim;n++)
              temp5(i,j,k,l,m)+=temp6(i,j,k,l,m,n);
            dadadt(i,j,k,m)-=temp5(i,j,k,l,m);
          }
        }
}