transform.cpp example, using a
principal rotation about the third axis:

void transform(double alpha, TinyVector<double,3>& x,
const TinyVector<double,3>& y)
{
TinyMatrix<double,3,3> C;
double cosa = cos(alpha),
sina = sin(alpha);
// Create the principal rotation matrix C_3(alpha)
C = cosa, -sina, 0.0,
sina, cosa, 0.0,
0.0, 0.0, 1.0;
x = product(C,y);
}
Here's a low-level implementation:
// Low-level version: applies the rotation about the third axis by hand,
// writing out only the non-trivial terms of the matrix-vector product.
//   alpha - rotation angle, passed directly to cos() and sin()
//   x     - output array; elements 0..2 are written
//   y     - input array; elements 0..2 are read
void transform2(double alpha, double* x,
double* y)
{
    const double cosa = cos(alpha);
    const double sina = sin(alpha);

    // y[0] and y[1] are read again after x[0] has been stored, so passing
    // the same array for x and y does not yield an in-place rotation.
    x[0] = cosa * y[0] - sina * y[1];
    x[1] = sina * y[0] + cosa * y[1];

    // The third component lies on the rotation axis and is copied through.
    x[2] = y[2];
}
You would expect the low-level implementation to be much faster,
since it eliminates most of the work of the matrix-vector multiplication.
However, using KAI C++ at +K3 -O3 on an RS/6000 (xlC back end), the
code generated for the high-level Blitz++ version is almost identical
to the code generated for the low-level version.
The only difference is a single instruction (the Blitz++ version does
a fneg/fma instead of a fms):
Blitz++ assembly:

```
.transform:
stfd fp31,-8(SP)
stfd fp30,-16(SP)
stm r30,-24(SP)
fmr fp31,fp1
mfspr r0,LR
cal r30,0(r5)
st r0,8(SP)
stu SP,-80(SP)
cal r31,0(r6)
bl .cos{PR}
cror CR3_SO,CR3_SO,CR3_SO
fmr fp30,fp1
fmr fp1,fp31
bl .sin{PR}
cror CR3_SO,CR3_SO,CR3_SO
lfd fp2,8(r31)
fneg fp0,fp1
lfd fp3,0(r31)
fm fp0,fp2,fp0
fma fp0,fp30,fp3,fp0
stfd fp0,0(r30)
lfd fp0,8(r31)
lfd fp2,0(r31)
fm fp0,fp30,fp0
fma fp0,fp2,fp1,fp0
l r12,88(SP)
lfd fp31,72(SP)
mtspr LR,r12
stfd fp0,8(r30)
lfd fp0,16(r31)
lfd fp30,64(SP)
ai SP,SP,80
stfd fp0,16(r30)
lm r30,-24(SP)
bcr BO_ALWAYS,CR0_LT
```

Low-level assembly:

```
.transform2:
stfd fp31,-8(SP)
stfd fp30,-16(SP)
stm r30,-24(SP)
fmr fp31,fp1
mfspr r0,LR
cal r30,0(r5)
st r0,8(SP)
stu SP,-80(SP)
cal r31,0(r6)
bl .cos{PR}
cror CR3_SO,CR3_SO,CR3_SO
fmr fp30,fp1
fmr fp1,fp31
bl .sin{PR}
cror CR3_SO,CR3_SO,CR3_SO
lfd fp0,8(r31)
lfd fp2,0(r31)
fm fp0,fp1,fp0
fms fp0,fp30,fp2,fp0
stfd fp0,0(r30)
lfd fp0,8(r31)
lfd fp2,0(r31)
fm fp0,fp30,fp0
fma fp0,fp2,fp1,fp0
l r12,88(SP)
lfd fp31,72(SP)
mtspr LR,r12
stfd fp0,8(r30)
lfd fp0,16(r31)
lfd fp30,64(SP)
ai SP,SP,80
stfd fp0,16(r30)
lm r30,-24(SP)
bcr BO_ALWAYS,CR0_LT
```