Blitz++ transform.cpp example

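Here's an equation which transforms a vector y into a vector x using a principal rotation about the third axis by an angle α:

$$x = C_3(\alpha)\,y, \qquad C_3(\alpha) = \begin{pmatrix} \cos\alpha & -\sin\alpha & 0 \\ \sin\alpha & \cos\alpha & 0 \\ 0 & 0 & 1 \end{pmatrix}$$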

Here's how this transformation might be coded in Blitz++. Note that the mathematical structure of the transformation (a matrix multiplication) is preserved:
void transform(double alpha, TinyVector<double,3>& x,
    const TinyVector<double,3>& y)
{
    TinyMatrix<double,3,3> C;

    double cosa = cos(alpha),
           sina = sin(alpha);

    // Create the principal rotation matrix C_3(alpha)
    C = cosa, -sina,   0.0,
        sina,  cosa,   0.0,
         0.0,   0.0,   1.0;

    x = product(C,y);
}
Here's a low-level implementation:
// Hand-expanded rotation about the third axis: reads y[0..2], writes x[0..2].
//   alpha : rotation angle, passed directly to cos() and sin()
//   x     : output array of at least three doubles
//   y     : input array of at least three doubles
// Note: x[0] is stored before y[0] is read again for x[1], so the statement
// order below matters if x and y refer to the same storage.
void transform2(double alpha, double* x, double* y)
{
    const double cosAlpha = cos(alpha);
    const double sinAlpha = sin(alpha);

    // First two components: rotation in the plane of axes 0 and 1.
    x[0] = cosAlpha * y[0] - sinAlpha * y[1];
    x[1] = sinAlpha * y[0] + cosAlpha * y[1];

    // Third component is copied through unchanged.
    x[2] = y[2];
}
You would expect the low-level implementation to be much faster, since it skips the multiplications by the zero and one entries of the rotation matrix that a general matrix-vector product performs. However, using KAI C++ at +K3 -O3 on an RS/6000 (xlC back end), the code generated for the high-level Blitz++ version is almost identical to the code generated for the low-level version. The only difference is a single extra instruction (the Blitz++ version does an fneg followed by an fma where the low-level version does an fms):
Blitz++ assembly (.transform), followed by the low-level assembly (.transform2):
# .transform: listing given for the Blitz++ transform() routine.
# Register use, as seen in the instructions below:
#   fp1        incoming value handed to cos and then to sin (alpha in the C++ source)
#   r5 -> r30  base address of the three result stores (x in the C++ source)
#   r6 -> r31  base address of the operand loads (y in the C++ source)
#   fp31       keeps the incoming fp1 alive across the call to cos
#   fp30       keeps the value returned by cos alive across the call to sin
.transform:
# Prologue: save fp31, fp30 and r30/r31 below the incoming SP, save LR at
# 8(SP), then push an 80-byte frame with stu. The copies of fp1, r5 and r6
# into registers that are saved/restored here are interleaved with the saves.
stfd    fp31,-8(SP)
stfd    fp30,-16(SP)
stm     r30,-24(SP)
fmr     fp31,fp1
mfspr   r0,LR
cal     r30,0(r5)
st      r0,8(SP)
stu     SP,-80(SP)
cal     r31,0(r6)
# fp1 = cos(fp1). The cror that follows ORs a CR bit with itself, i.e. has no
# effect. NOTE(review): presumably the post-call no-op slot used by the AIX
# linkage convention -- confirm.
bl      .cos{PR}
cror    CR3_SO,CR3_SO,CR3_SO
# fp30 = cos result; reload the original argument and compute fp1 = sin(arg).
fmr     fp30,fp1
fmr     fp1,fp31
bl      .sin{PR}
cror    CR3_SO,CR3_SO,CR3_SO
# x[0] = cos*y[0] + y[1]*(-sin)
# This fneg + fm + fma sequence is the one place that differs from
# .transform2, which uses fm + fms for the same element.
lfd     fp2,8(r31)
fneg    fp0,fp1
lfd     fp3,0(r31)
fm      fp0,fp2,fp0
fma     fp0,fp30,fp3,fp0
stfd    fp0,0(r30)
# x[1] = y[0]*sin + cos*y[1]; y[0] and y[1] are loaded again after the store
# to x[0].
lfd     fp0,8(r31)
lfd     fp2,0(r31)
fm      fp0,fp30,fp0
fma     fp0,fp2,fp1,fp0
# Epilogue, interleaved with the last two stores: reload the saved LR
# (88(SP) is 8 above the pre-stu SP) and fp31 (72(SP) is 8 below it).
l       r12,88(SP)
lfd     fp31,72(SP)
mtspr   LR,r12
# Store x[1], then fetch y[2].
stfd    fp0,8(r30)
lfd     fp0,16(r31)
# Restore fp30 (64(SP) is 16 below the pre-stu SP), then pop the 80-byte frame.
lfd     fp30,64(SP)
ai      SP,SP,80
# x[2] = y[2]
stfd    fp0,16(r30)
# Restore r30/r31 and return through LR.
lm      r30,-24(SP)
bcr     BO_ALWAYS,CR0_LT
# .transform2: listing given for the hand-written low-level transform2() routine.
# Register use, as seen in the instructions below:
#   fp1        incoming value handed to cos and then to sin (alpha in the C++ source)
#   r5 -> r30  base address of the three result stores (x in the C++ source)
#   r6 -> r31  base address of the operand loads (y in the C++ source)
#   fp31       keeps the incoming fp1 alive across the call to cos
#   fp30       keeps the value returned by cos alive across the call to sin
.transform2:
# Prologue: save fp31, fp30 and r30/r31 below the incoming SP, save LR at
# 8(SP), then push an 80-byte frame with stu. The copies of fp1, r5 and r6
# into registers that are saved/restored here are interleaved with the saves.
stfd    fp31,-8(SP)
stfd    fp30,-16(SP)
stm     r30,-24(SP)
fmr     fp31,fp1
mfspr   r0,LR
cal     r30,0(r5)
st      r0,8(SP)
stu     SP,-80(SP)
cal     r31,0(r6)
# fp1 = cos(fp1). The cror that follows ORs a CR bit with itself, i.e. has no
# effect. NOTE(review): presumably the post-call no-op slot used by the AIX
# linkage convention -- confirm.
bl      .cos{PR}
cror    CR3_SO,CR3_SO,CR3_SO
# fp30 = cos result; reload the original argument and compute fp1 = sin(arg).
fmr     fp30,fp1
fmr     fp1,fp31
bl      .sin{PR}
cror    CR3_SO,CR3_SO,CR3_SO
# x[0] = cos*y[0] - sin*y[1]
# A single fms performs the subtraction here; .transform uses fneg + fma for
# the same element.
lfd     fp0,8(r31)
lfd     fp2,0(r31)
fm      fp0,fp1,fp0
fms     fp0,fp30,fp2,fp0
stfd    fp0,0(r30)
# x[1] = y[0]*sin + cos*y[1]; y[0] and y[1] are loaded again after the store
# to x[0].
lfd     fp0,8(r31)
lfd     fp2,0(r31)
fm      fp0,fp30,fp0
fma     fp0,fp2,fp1,fp0
# Epilogue, interleaved with the last two stores: reload the saved LR
# (88(SP) is 8 above the pre-stu SP) and fp31 (72(SP) is 8 below it).
l       r12,88(SP)
lfd     fp31,72(SP)
mtspr   LR,r12
# Store x[1], then fetch y[2].
stfd    fp0,8(r30)
lfd     fp0,16(r31)
# Restore fp30 (64(SP) is 16 below the pre-stu SP), then pop the 80-byte frame.
lfd     fp30,64(SP)
ai      SP,SP,80
# x[2] = y[2]
stfd    fp0,16(r30)
# Restore r30/r31 and return through LR.
lm      r30,-24(SP)
bcr     BO_ALWAYS,CR0_LT