




GCC Improvement for 68080

Stefan "Bebbo" Franke

Posts 139
01 Jul 2019 16:10


Gunnar von Boehn wrote:

Did you see my post reporting that unneeded longer EA modes are used?
  How could this happen with -Os?

-Os considers the insn cost, not the insn size.
Calculation of the costs is done either for speed or for size.
And finally, the cost calculation must match the expectations.
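
A toy model of that point (illustration only; the names and numbers below are invented for the example and are not GCC source): under -Os the backend cost query runs with the "speed" flag cleared, and if that size-flavoured cost does not mirror the real encoding length, a longer EA mode can still come out as the "cheaper" alternative.

/* Toy illustration (not GCC source): why -Os can still pick a longer
   effective-address mode.  Under -Os the cost query runs with
   speed == false; if that cost does not track the real encoding size,
   the "cheaper" alternative may in fact be the fatter one. */
#include <stdbool.h>
#include <stdio.h>

enum ea_mode { EA_AREG_INDIRECT, EA_DISP16, EA_DISP32 };

/* Real size of the extension words in bytes (68020+-style encodings). */
static int ea_extension_bytes(enum ea_mode m)
{
    switch (m) {
    case EA_AREG_INDIRECT: return 0;   /* (a0)      */
    case EA_DISP16:        return 2;   /* (d16,a0)  */
    case EA_DISP32:        return 6;   /* (bd.l,a0) */
    }
    return 0;
}

/* A deliberately sloppy cost model: for size it lumps both displacement
   forms together, so the size optimizer cannot tell them apart. */
static int ea_cost(enum ea_mode m, bool speed)
{
    if (speed)
        return m == EA_AREG_INDIRECT ? 1 : 2;   /* rough cycle count  */
    return m == EA_AREG_INDIRECT ? 0 : 1;       /* too coarse for -Os */
}

int main(void)
{
    /* With speed == false both displacement modes report cost 1, even
       though one is 4 bytes fatter: cost and expectation disagree. */
    printf("disp16: cost=%d size=%d\n", ea_cost(EA_DISP16, false), ea_extension_bytes(EA_DISP16));
    printf("disp32: cost=%d size=%d\n", ea_cost(EA_DISP32, false), ea_extension_bytes(EA_DISP32));
    return 0;
}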


Gunnar von Boehn
(Apollo Team Member)
Posts 6207
01 Jul 2019 17:19


Stefan "Bebbo" Franke wrote:

Gunnar von Boehn wrote:

  Did you see my post reporting that unneeded longer EA modes are used?
  How could this happen with -Os?
 

 
  -Os considers the insn cost, not the insn size.
  Calculation of the costs is done either for speed or for size.
  And finally, the cost calculation must match the expectations.

Not sure I understand your answer.

What I tried to say is that the generated code with "-Os" is not optimal yet - it's both slower and fatter than what we expect. :(

Let's look at this example:

#include <string.h>
void Scale(double scalar, double* b, double* c)
{
    size_t j;
    double t1;
    double t2;
    double t3;
    double t4;
    for (j=1000; j; j--){
        t1 = scalar* *c++;
        t2 = scalar* *c++;
        t3 = scalar* *c++;
        t4 = scalar* *c++;
        *b++ =t1;
        *b++ =t2;
        *b++ =t3;
        *b++ =t4;
    }
}

 

-Os -m68080 -mhard-float -fomit-frame-pointer

I think that this "fictive" code would be both SMALLER and faster.


_Scale:
          fmovem #28,-(sp)
          fdmove.d (40,sp),fp0
          move.l (48,sp),a0
          move.l (52,sp),a1
          move.l #1000,d0
.L2:
          fdmove.x fp0,fp3
          fdmul.d  (a1)+,fp3
          fdmove.x fp0,fp2
          fdmul.d  (a1)+,fp2
          fdmove.x fp0,fp1
          fdmove.d (a1)+,fp4
          fdmul.x  fp0,fp4
          fdmul.d  (a1)+,fp1
          fmove.d  fp4,(a0)+
          fmove.d  fp3,(a0)+
          fmove.d  fp2,(a0)+
          fmove.d  fp1,(a0)+
          subq.l #1,d0
          jne .L2
          fmovem (sp)+,#56
          rts

What do you think?
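
A minimal sketch of a driver for trying this locally (not from the thread; the cross-compiler command line is an assumption based on the flags quoted above):

/* Sketch of a driver for Scale() - not part of the thread, just a way
   to build and sanity-check the routine before reading the assembly.
   Possible build, assuming a bebbo-style cross toolchain:
     m68k-amigaos-gcc -Os -m68080 -mhard-float -fomit-frame-pointer scale.c driver.c */
#include <stdio.h>

void Scale(double scalar, double *b, double *c);

static double src[4000], dst[4000];   /* 1000 iterations x 4 doubles */

int main(void)
{
    for (int i = 0; i < 4000; i++)
        src[i] = i * 0.5;

    Scale(2.0, dst, src);              /* dst[i] = 2.0 * src[i] */

    printf("%f %f %f\n", dst[0], dst[1], dst[3999]);
    return dst[3999] == 3999.0 ? 0 : 1;  /* 2.0 * (3999 * 0.5) */
}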


Steve Ferrell

Posts 424
01 Jul 2019 17:24


Stefan "Bebbo" Franke wrote:

Steve Ferrell wrote:

  This is probably more appropriate:
  ...
  void multiplyMatrix()
  {
      for(int i = 0; i < 4; i++ ){
          for(int j = 0; j < 1; j++){
              outputMatrix[j] = 0;
              for(int k = 0; k < 4; k++){
                  outputMatrix[j] += rotationMatrix[k] * inputMatrix[k][j];
              }
          }
      }
  }
  ...
 

 
  does not compile

Strange, it compiled for me under both compilers. Here's the 6.5.0b output:

        link.w a5,#0
        moveq #0,d0
        unlk a5
        rts
__ZSt3cosf:
        link.w a5,#0
        move.l (8,a5),-(sp)
        jsr _cosf
        addq.l #4,sp
        unlk a5
        rts
__ZSt3sinf:
        link.w a5,#0
        move.l (8,a5),-(sp)
        jsr _sinf
        addq.l #4,sp
        unlk a5
        rts
__ZSt4sqrtf:
        link.w a5,#0
        move.l (8,a5),-(sp)
        jsr _sqrtf
        addq.l #4,sp
        unlk a5
        rts
_points:
_rotationMatrix:
_inputMatrix:
_outputMatrix:
.LC0:
        .ascii "(\0"
.LC1:
        .ascii ",\0"
.LC2:
        .ascii ")\0"
__Z9showPointv:
        link.w a5,#0
        movem.l d4/d3/d2,-(sp)
        move.l 8+_outputMatrix,d2
        move.l 4+_outputMatrix,d3
        move.l _outputMatrix,d4
        pea .LC0
        pea __ZSt4cout
        jsr __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
        addq.l #8,sp
        move.l d4,-(sp)
        move.l d0,-(sp)
        jsr __ZNSolsEf
        addq.l #8,sp
        pea .LC1
        move.l d0,-(sp)
        jsr __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
        addq.l #8,sp
        move.l d3,-(sp)
        move.l d0,-(sp)
        jsr __ZNSolsEf
        addq.l #8,sp
        pea .LC1
        move.l d0,-(sp)
        jsr __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
        addq.l #8,sp
        move.l d2,-(sp)
        move.l d0,-(sp)
        jsr __ZNSolsEf
        addq.l #8,sp
        pea .LC2
        move.l d0,-(sp)
        jsr __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
        addq.l #8,sp
        pea __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
        move.l d0,-(sp)
        jsr __ZNSolsEPFRSoS_E
        addq.l #8,sp
        nop
        movem.l (-12,a5),d2/d3/d4
        unlk a5
        rts
__Z14multiplyMatrixv:
        link.w a5,#-12
        move.l d2,-(sp)
        clr.l (-4,a5)
.L16:
        moveq #3,d0
        cmp.l (-4,a5),d0
        jlt .L17
        clr.l (-8,a5)
.L15:
        tst.l (-8,a5)
        jgt .L12
        move.l (-4,a5),d0
        add.l (-8,a5),d0
        lsl.l #2,d0
        lea _outputMatrix,a0
        clr.l (0,a0,d0.l)
        clr.l (-12,a5)
.L14:
        moveq #3,d0
        cmp.l (-12,a5),d0
        jlt .L13
        move.l (-4,a5),d0
        add.l (-8,a5),d0
        lsl.l #2,d0
        lea _outputMatrix,a0
        move.l (0,a0,d0.l),d2
        move.l (-4,a5),d0
        lsl.l #2,d0
        add.l (-12,a5),d0
        lsl.l #2,d0
        lea _rotationMatrix,a0
        move.l (0,a0,d0.l),d1
        move.l (-8,a5),d0
        add.l (-12,a5),d0
        lsl.l #2,d0
        lea _inputMatrix,a0
        move.l (0,a0,d0.l),d0
        move.l d0,-(sp)
        move.l d1,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l d0,d1
        move.l (-4,a5),d0
        add.l (-8,a5),d0
        lsl.l #2,d0
        lea _outputMatrix,a0
        move.l d1,(0,a0,d0.l)
        addq.l #1,(-12,a5)
        jra .L14
.L13:
        addq.l #1,(-8,a5)
        jra .L15
.L12:
        addq.l #1,(-4,a5)
        jra .L16
.L17:
        nop
        move.l (-16,a5),d2
        unlk a5
        rts
__Z19setUpRotationMatrixffff:
        link.w a5,#-16
        move.l d3,-(sp)
        move.l d2,-(sp)
        move.l a5,a0
        move.l (12,a0),-(sp)
        move.l a5,a0
        move.l (12,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l a5,a0
        move.l (16,a0),-(sp)
        move.l a5,a0
        move.l (16,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l d0,d2
        move.l a5,a0
        move.l (20,a0),-(sp)
        move.l a5,a0
        move.l (20,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l d0,(-4,a5)
        move.l (8,a5),-(sp)
        jsr ___extendsfdf2
        addq.l #4,sp
        move.l #1413754136,-(sp)
        move.l #1074340347,-(sp)
        move.l d1,-(sp)
        move.l d0,-(sp)
        jsr ___muldf3
        lea (16,sp),sp
        clr.l -(sp)
        move.l #1080459264,-(sp)
        move.l d1,-(sp)
        move.l d0,-(sp)
        jsr ___divdf3
        lea (16,sp),sp
        move.l d1,-(sp)
        move.l d0,-(sp)
        jsr ___truncdfsf2
        addq.l #8,sp
        move.l d0,(8,a5)
        move.l a5,a0
        move.l (12,a0),-(sp)
        move.l a5,a0
        move.l (12,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,(-8,a5)
        move.l a5,a0
        move.l (16,a0),-(sp)
        move.l a5,a0
        move.l (16,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,(-12,a5)
        move.l a5,a0
        move.l (20,a0),-(sp)
        move.l a5,a0
        move.l (20,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,(-16,a5)
        move.l (-16,a5),-(sp)
        move.l (-12,a5),-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (8,a5),-(sp)
        jsr __ZSt3cosf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l (-8,a5),-(sp)
        move.l d0,-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l (-4,a5),-(sp)
        move.l d0,-(sp)
        jsr ___divsf3
        addq.l #8,sp
        move.l d0,_rotationMatrix
        move.l a5,a0
        move.l (16,a0),-(sp)
        move.l a5,a0
        move.l (12,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (8,a5),-(sp)
        jsr __ZSt3cosf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l #0x3f800000,-(sp)
        jsr ___subsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (-4,a5),-(sp)
        jsr __ZSt4sqrtf
        addq.l #4,sp
        move.l a5,a0
        move.l (20,a0),-(sp)
        move.l d0,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d3
        move.l (8,a5),-(sp)
        jsr __ZSt3sinf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l d3,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___subsf3
        addq.l #8,sp
        move.l (-4,a5),-(sp)
        move.l d0,-(sp)
        jsr ___divsf3
        addq.l #8,sp
        move.l d0,4+_rotationMatrix
        move.l a5,a0
        move.l (20,a0),-(sp)
        move.l a5,a0
        move.l (12,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (8,a5),-(sp)
        jsr __ZSt3cosf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l #0x3f800000,-(sp)
        jsr ___subsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (-4,a5),-(sp)
        jsr __ZSt4sqrtf
        addq.l #4,sp
        move.l a5,a0
        move.l (16,a0),-(sp)
        move.l d0,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d3
        move.l (8,a5),-(sp)
        jsr __ZSt3sinf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l d3,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l (-4,a5),-(sp)
        move.l d0,-(sp)
        jsr ___divsf3
        addq.l #8,sp
        move.l d0,8+_rotationMatrix
        clr.l 12+_rotationMatrix
        move.l a5,a0
        move.l (16,a0),-(sp)
        move.l a5,a0
        move.l (12,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (8,a5),-(sp)
        jsr __ZSt3cosf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l #0x3f800000,-(sp)
        jsr ___subsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (-4,a5),-(sp)
        jsr __ZSt4sqrtf
        addq.l #4,sp
        move.l a5,a0
        move.l (20,a0),-(sp)
        move.l d0,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d3
        move.l (8,a5),-(sp)
        jsr __ZSt3sinf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l d3,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l (-4,a5),-(sp)
        move.l d0,-(sp)
        jsr ___divsf3
        addq.l #8,sp
        move.l d0,16+_rotationMatrix
        move.l (-16,a5),-(sp)
        move.l (-8,a5),-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (8,a5),-(sp)
        jsr __ZSt3cosf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l (-12,a5),-(sp)
        move.l d0,-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l (-4,a5),-(sp)
        move.l d0,-(sp)
        jsr ___divsf3
        addq.l #8,sp
        move.l d0,20+_rotationMatrix
        move.l a5,a0
        move.l (20,a0),-(sp)
        move.l a5,a0
        move.l (16,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (8,a5),-(sp)
        jsr __ZSt3cosf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l #0x3f800000,-(sp)
        jsr ___subsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (-4,a5),-(sp)
        jsr __ZSt4sqrtf
        addq.l #4,sp
        move.l a5,a0
        move.l (12,a0),-(sp)
        move.l d0,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d3
        move.l (8,a5),-(sp)
        jsr __ZSt3sinf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l d3,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___subsf3
        addq.l #8,sp
        move.l (-4,a5),-(sp)
        move.l d0,-(sp)
        jsr ___divsf3
        addq.l #8,sp
        move.l d0,24+_rotationMatrix
        clr.l 28+_rotationMatrix
        move.l a5,a0
        move.l (20,a0),-(sp)
        move.l a5,a0
        move.l (12,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (8,a5),-(sp)
        jsr __ZSt3cosf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l #0x3f800000,-(sp)
        jsr ___subsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (-4,a5),-(sp)
        jsr __ZSt4sqrtf
        addq.l #4,sp
        move.l a5,a0
        move.l (16,a0),-(sp)
        move.l d0,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d3
        move.l (8,a5),-(sp)
        jsr __ZSt3sinf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l d3,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___subsf3
        addq.l #8,sp
        move.l (-4,a5),-(sp)
        move.l d0,-(sp)
        jsr ___divsf3
        addq.l #8,sp
        move.l d0,32+_rotationMatrix
        move.l a5,a0
        move.l (20,a0),-(sp)
        move.l a5,a0
        move.l (16,a0),-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (8,a5),-(sp)
        jsr __ZSt3cosf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l #0x3f800000,-(sp)
        jsr ___subsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (-4,a5),-(sp)
        jsr __ZSt4sqrtf
        addq.l #4,sp
        move.l a5,a0
        move.l (12,a0),-(sp)
        move.l d0,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,d3
        move.l (8,a5),-(sp)
        jsr __ZSt3sinf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l d3,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l (-4,a5),-(sp)
        move.l d0,-(sp)
        jsr ___divsf3
        addq.l #8,sp
        move.l d0,36+_rotationMatrix
        move.l (-12,a5),-(sp)
        move.l (-8,a5),-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l d0,d2
        move.l (8,a5),-(sp)
        jsr __ZSt3cosf
        addq.l #4,sp
        move.l d0,-(sp)
        move.l d2,-(sp)
        jsr ___mulsf3
        addq.l #8,sp
        move.l (-16,a5),-(sp)
        move.l d0,-(sp)
        jsr ___addsf3
        addq.l #8,sp
        move.l (-4,a5),-(sp)
        move.l d0,-(sp)
        jsr ___divsf3
        addq.l #8,sp
        move.l d0,40+_rotationMatrix
        clr.l 44+_rotationMatrix
        clr.l 48+_rotationMatrix
        clr.l 52+_rotationMatrix
        clr.l 56+_rotationMatrix
        move.l #0x3f800000,60+_rotationMatrix
        nop
        move.l (-24,a5),d2
        move.l (-20,a5),d3
        unlk a5
        rts
.LC3:
        .ascii "Enter the initial point you want to transform:\0"
.LC4:
        .ascii "Enter axis vector: \0"
.LC5:
        .ascii "Enter the rotating angle in degree: \0"
_main:
        link.w a5,#-16
        pea .LC3
        pea __ZSt4cout
        jsr __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
        addq.l #8,sp
        pea _points
        pea __ZSt3cin
        jsr __ZNSirsERf
        addq.l #8,sp
        pea 4+_points
        move.l d0,-(sp)
        jsr __ZNSirsERf
        addq.l #8,sp
        pea 8+_points
        move.l d0,-(sp)
        jsr __ZNSirsERf
        addq.l #8,sp
        move.l _points,d0
        move.l d0,_inputMatrix
        move.l 4+_points,d0
        move.l d0,4+_inputMatrix
        move.l 8+_points,d0
        move.l d0,8+_inputMatrix
        move.l #0x3f800000,12+_inputMatrix
        pea .LC4
        pea __ZSt4cout
        jsr __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
        addq.l #8,sp
        move.l a5,d0
        subq.l #8,d0
        move.l d0,-(sp)
        pea __ZSt3cin
        jsr __ZNSirsERf
        addq.l #8,sp
        lea (-12,a5),a0
        move.l a0,-(sp)
        move.l d0,-(sp)
        jsr __ZNSirsERf
        addq.l #8,sp
        lea (-16,a5),a0
        move.l a0,-(sp)
        move.l d0,-(sp)
        jsr __ZNSirsERf
        addq.l #8,sp
        pea .LC5
        pea __ZSt4cout
        jsr __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
        addq.l #8,sp
        move.l a5,d0
        subq.l #4,d0
        move.l d0,-(sp)
        pea __ZSt3cin
        jsr __ZNSirsERf
        addq.l #8,sp
        move.l (-16,a5),a1
        move.l (-12,a5),a0
        move.l (-8,a5),d1
        move.l (-4,a5),d0
        move.l a1,-(sp)
        move.l a0,-(sp)
        move.l d1,-(sp)
        move.l d0,-(sp)
        jsr __Z19setUpRotationMatrixffff
        lea (16,sp),sp
        jsr __Z14multiplyMatrixv
        jsr __Z9showPointv
        moveq #0,d0
        unlk a5
        rts
__Z41__static_initialization_and_destruction_0ii:
        link.w a5,#0
        moveq #1,d0
        cmp.l (8,a5),d0
        jne .L22
        move.l a5,a0
        cmp.l #65535,(12,a0)
        jne .L22
        pea __ZStL8__ioinit
        jsr __ZNSt8ios_base4InitC1Ev [complete object constructor]
        addq.l #4,sp
.L22:
        tst.l (8,a5)
        jne .L24
        move.l a5,a0
        cmp.l #65535,(12,a0)
        jne .L24
        pea __ZStL8__ioinit
        jsr __ZNSt8ios_base4InitD1Ev [complete object destructor]
        addq.l #4,sp
.L24:
        nop
        unlk a5
        rts
        link.w a5,#0
        move.l #65535,-(sp)
        pea 1.w
        jsr __Z41__static_initialization_and_destruction_0ii
        addq.l #8,sp
        unlk a5
        rts
        link.w a5,#0
        move.l #65535,-(sp)
        clr.l -(sp)
        jsr __Z41__static_initialization_and_destruction_0ii
        addq.l #8,sp
        unlk a5
        rts
        __EH_FRAME_OBJECT__:




Steve Ferrell

Posts 424
01 Jul 2019 17:25


Stefan "Bebbo" Franke wrote:

Steve Ferrell wrote:

  This is probably more appropriate:
  ...
  void multiplyMatrix()
  {
      for(int i = 0; i < 4; i++ ){
          for(int j = 0; j < 1; j++){
              outputMatrix[j] = 0;
              for(int k = 0; k < 4; k++){
                  outputMatrix[j] += rotationMatrix[k] * inputMatrix[k][j];
              }
          }
      }
  }
  ...
 

 
  does not compile

And here's the 2.95.3 output:

_inputMatrix:
        .long 0x0
        .long 0x0
        .long 0x0
        .long 0x0
_outputMatrix:
        .long 0x0
        .long 0x0
        .long 0x0
        .long 0x0
LC0:
        .ascii ")\0"
LC1:
        .ascii ",\0"
LC2:
        .ascii "(\0"
_showPoint(void):
        link a5,#0
        pea _endl__FR7ostream
        pea LC0
        movel _outputMatrix+8,sp@-
        pea LC1
        movel _outputMatrix+4,sp@-
        pea LC1
        movel _outputMatrix,sp@-
        pea LC2
        pea _cout
        jbsr ostream::___ls(char const *)
        addql #8,sp
        movel d0,sp@-
        jbsr ostream::___ls(float)
        addql #8,sp
        movel d0,sp@-
        jbsr ostream::___ls(char const *)
        addql #8,sp
        movel d0,sp@-
        jbsr ostream::___ls(float)
        addql #8,sp
        movel d0,sp@-
        jbsr ostream::___ls(char const *)
        addql #8,sp
        movel d0,sp@-
        jbsr ostream::___ls(float)
        addql #8,sp
        movel d0,sp@-
        jbsr ostream::___ls(char const *)
        addql #8,sp
        movel d0,sp@-
        jbsr ostream::___ls(ostream &(*)(ostream &))
        addql #8,sp
        unlk a5
        rts
_multiplyMatrix(void):
        link a5,#-12
        moveml #0x3830,sp@-
        clrl a5@(-4)
L290:
        moveq #3,d0
        cmpl a5@(-4),d0
        jge L293
        jra L291
L293:
        clrl a5@(-8)
L294:
        tstl a5@(-8)
        jle L297
        jra L292
L297:
        movel a5@(-8),d0
        addl a5@(-4),d0
        movel d0,d1
        movel d1,d0
        lsll #2,d0
        lea _outputMatrix,a0
        clrl a0@(d0:l)
        clrl a5@(-12)
L298:
        moveq #3,d0
        cmpl a5@(-12),d0
        jge L301
        jra L296
L301:
        movel a5@(-8),d0
        addl a5@(-4),d0
        movel d0,d1
        movel d1,d2
        lsll #2,d2
        lea _outputMatrix,a2
        movel a5@(-8),d0
        addl a5@(-4),d0
        movel d0,d1
        movel d1,d3
        lsll #2,d3
        lea _outputMatrix,a3
        movel a5@(-12),d0
        movel d0,d1
        movel d1,d0
        lsll #2,d0
        movel a5@(-4),d1
        movel d1,d4
        movel d4,d1
        lsll #4,d1
        addl d1,d0
        lea _rotationMatrix,a0
        movel a5@(-8),d1
        addl a5@(-12),d1
        movel d1,d4
        movel d4,d1
        lsll #2,d1
        lea _inputMatrix,a1
        movel a1@(d1:l),sp@-
        movel a0@(d0:l),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,sp@-
        movel a3@(d3:l),sp@-
        jbsr ___addsf3
        addql #8,sp
        movel d0,a2@(d2:l)
        addql #1,a5@(-12)
        jra L298
L296:
        addql #1,a5@(-8)
        jra L294
L292:
        addql #1,a5@(-4)
        jra L290
L291:
        moveml a5@(-32),#0xc1c
        unlk a5
        rts
_setUpRotationMatrix(float, float, float, float):
        link a5,#-16
        moveml #0x3c00,sp@-
        movel a5@(12),sp@-
        movel a5@(12),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,d2
        movel a5@(16),sp@-
        movel a5@(16),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,sp@-
        movel d2,sp@-
        jbsr ___addsf3
        addql #8,sp
        movel d0,d2
        movel a5@(20),sp@-
        movel a5@(20),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,sp@-
        movel d2,sp@-
        jbsr ___addsf3
        addql #8,sp
        movel d0,a5@(-4)
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel #1413754136,sp@-
        movel #1074340347,sp@-
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        clrl sp@-
        movel #1080459264,sp@-
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,a5@(8)
        movel a5@(12),sp@-
        movel a5@(12),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,a5@(-8)
        movel a5@(16),sp@-
        movel a5@(16),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,a5@(-12)
        movel a5@(20),sp@-
        movel a5@(20),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,a5@(-16)
        movel a5@(-8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d2
        movel d1,d3
        movel a5@(-16),sp@-
        movel a5@(-12),sp@-
        jbsr ___addsf3
        addql #8,sp
        movel d0,sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d4
        movel d1,d5
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _cos
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___adddf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,_rotationMatrix
        movel a5@(16),sp@-
        movel a5@(12),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d2
        movel d1,d3
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _cos
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        clrl sp@-
        movel #1072693248,sp@-
        jbsr ___subdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(20),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d4
        movel d1,d5
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sqrt
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d4
        movel d1,d5
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sin
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___subdf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,_rotationMatrix+4
        movel a5@(20),sp@-
        movel a5@(12),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d2
        movel d1,d3
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _cos
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        clrl sp@-
        movel #1072693248,sp@-
        jbsr ___subdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(16),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d4
        movel d1,d5
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sqrt
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d4
        movel d1,d5
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sin
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___adddf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,_rotationMatrix+8
        clrl _rotationMatrix+12
        movel a5@(16),sp@-
        movel a5@(12),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d2
        movel d1,d3
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _cos
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        clrl sp@-
        movel #1072693248,sp@-
        jbsr ___subdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(20),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d4
        movel d1,d5
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sqrt
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d4
        movel d1,d5
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sin
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___adddf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,_rotationMatrix+16
        movel a5@(-12),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d2
        movel d1,d3
        movel a5@(-16),sp@-
        movel a5@(-8),sp@-
        jbsr ___addsf3
        addql #8,sp
        movel d0,sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d4
        movel d1,d5
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _cos
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___adddf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,_rotationMatrix+20
        movel a5@(20),sp@-
        movel a5@(16),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d2
        movel d1,d3
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _cos
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        clrl sp@-
        movel #1072693248,sp@-
        jbsr ___subdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(12),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d4
        movel d1,d5
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sqrt
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d4
        movel d1,d5
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sin
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___subdf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,_rotationMatrix+24
        clrl _rotationMatrix+28
        movel a5@(20),sp@-
        movel a5@(12),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d2
        movel d1,d3
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _cos
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        clrl sp@-
        movel #1072693248,sp@-
        jbsr ___subdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(16),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d4
        movel d1,d5
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sqrt
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d4
        movel d1,d5
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sin
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___subdf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,_rotationMatrix+32
        movel a5@(20),sp@-
        movel a5@(16),sp@-
        jbsr ___mulsf3
        addql #8,sp
        movel d0,sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d2
        movel d1,d3
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _cos
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        clrl sp@-
        movel #1072693248,sp@-
        jbsr ___subdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(12),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d4
        movel d1,d5
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sqrt
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d0,d4
        movel d1,d5
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _sin
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___adddf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,_rotationMatrix+36
        movel a5@(-16),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d2
        movel d1,d3
        movel a5@(-12),sp@-
        movel a5@(-8),sp@-
        jbsr ___addsf3
        addql #8,sp
        movel d0,sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d0,d4
        movel d1,d5
        movel a5@(8),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr _cos
        addql #8,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d5,sp@-
        movel d4,sp@-
        jbsr ___muldf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___adddf3
        lea sp@(16),sp
        movel d0,d2
        movel d1,d3
        movel a5@(-4),sp@-
        jbsr ___extendsfdf2
        addql #4,sp
        movel d1,sp@-
        movel d0,sp@-
        movel d3,sp@-
        movel d2,sp@-
        jbsr ___divdf3
        lea sp@(16),sp
        movel d1,sp@-
        movel d0,sp@-
        jbsr ___truncdfsf2
        addql #8,sp
        movel d0,_rotationMatrix+40
        clrl _rotationMatrix+44
        clrl _rotationMatrix+48
        clrl _rotationMatrix+52
        clrl _rotationMatrix+56
        movel #0x3f800000,_rotationMatrix+60
        moveml a5@(-32),#0x3c
        unlk a5
        rts
LC3:
        .ascii "Enter the initial point you want to transform:\0"
LC4:
        .ascii "Enter axis vector: \0"
LC5:
        .ascii "Enter the rotating angle in degree: \0"
_main:
        link a5,#-16
        jbsr ___main
        pea LC3
        pea _cout
        jbsr ostream::___ls(char const *)
        addql #8,sp
        pea _points+8
        pea _points+4
        pea _points
        pea _cin
        jbsr istream::___rs(float &)
        addql #8,sp
        movel d0,sp@-
        jbsr istream::___rs(float &)
        addql #8,sp
        movel d0,sp@-
        jbsr istream::___rs(float &)
        addql #8,sp
        movel _points,_inputMatrix
        movel _points+4,_inputMatrix+4
        movel _points+8,_inputMatrix+8
        movel #0x3f800000,_inputMatrix+12
        pea LC4
        pea _cout
        jbsr ostream::___ls(char const *)
        addql #8,sp
        moveq #-16,d0
        addl a5,d0
        movel d0,sp@-
        moveq #-12,d0
        addl a5,d0
        movel d0,sp@-
        movel a5,d0
        subql #8,d0
        movel d0,sp@-
        pea _cin
        jbsr istream::___rs(float &)
        addql #8,sp
        movel d0,sp@-
        jbsr istream::___rs(float &)
        addql #8,sp
        movel d0,sp@-
        jbsr istream::___rs(float &)
        addql #8,sp
        pea LC5
        pea _cout
        jbsr ostream::___ls(char const *)
        addql #8,sp
        movel a5,d0
        subql #4,d0
        movel d0,sp@-
        pea _cin
        jbsr istream::___rs(float &)
        addql #8,sp
        movel a5@(-16),sp@-
        movel a5@(-12),sp@-
        movel a5@(-8),sp@-
        movel a5@(-4),sp@-
        jbsr _setUpRotationMatrix(float, float, float, float)
        lea sp@(16),sp
        jbsr _multiplyMatrix(void)
        jbsr _showPoint(void)
        moveq #0,d0
        jra L303
        moveq #0,d0
        jra L303
L303:
        unlk a5
        rts
_points:
_rotationMatrix:



Stefan "Bebbo" Franke

Posts 139
01 Jul 2019 19:20


Steve Ferrell wrote:

 
Stefan "Bebbo" Franke wrote:

 
Steve Ferrell wrote:

    This is probably more appropriate:
    ...
    void multiplyMatrix()
    {
        for(int i = 0; i < 4; i++ ){
            for(int j = 0; j < 1; j++){
                outputMatrix[j] = 0;
                for(int k = 0; k < 4; k++){
                    outputMatrix[j] += rotationMatrix[k] * inputMatrix[k][j];
                }
            }
        }
    }
    ...
   

   
    does not compile
 

 
  And here's the 2.95.3 output:
   
 

 
You are using something different. The posted code does not compile with 2.95.3 either - see EXTERNAL LINK
And if I add some obviously missing indexes, gcc 6.5.0b emits:

__Z14multiplyMatrixv:
        fmovem #60,-(sp)
        lea _inputMatrix,a0
        fsmove.s (a0)+,fp0
        fsmove.s (a0)+,fp4
        fsmove.s (a0)+,fp3
        fsmove.s (a0),fp2
        lea _outputMatrix,a1
        fsmove.x fp0,fp1
        lea _rotationMatrix,a0
        fsmul.s (a0),fp1
        fsmove.x fp4,fp5
        fsmul.s (16,a0),fp5
        fsadd.s #0x0,fp1
        fsadd.x fp5,fp1
        fsmove.x fp3,fp5
        fsmul.s (32,a0),fp5
        fsadd.x fp5,fp1
        fsmove.x fp2,fp5
        fsmul.s (48,a0),fp5
        fsadd.x fp5,fp1
        fmove.s fp1,(a1)+
        fsmove.x fp0,fp1
        fsmul.s (4,a0),fp1
        fsmove.x fp4,fp5
        fsmul.s (20,a0),fp5
        fsadd.s #0x0,fp1
        fsadd.x fp5,fp1
        fsmove.x fp3,fp5
        fsmul.s (36,a0),fp5
        fsadd.x fp5,fp1
        fsmove.x fp2,fp5
        fsmul.s (52,a0),fp5
        fsadd.x fp5,fp1
        fmove.s fp1,(a1)+
        fsmove.x fp0,fp1
        fsmul.s (8,a0),fp1
        fsmove.x fp4,fp5
        fsmul.s (24,a0),fp5
        fsadd.s #0x0,fp1
        fsadd.x fp5,fp1
        fsmove.x fp3,fp5
        fsmul.s (40,a0),fp5
        fsadd.x fp5,fp1
        fsmove.x fp2,fp5
        fsmul.s (56,a0),fp5
        fsadd.x fp5,fp1
        fmove.s fp1,(a1)+
        fsmul.s (12,a0),fp0
        fsmul.s (28,a0),fp4
        fsadd.s #0x0,fp0
        fsmul.s (44,a0),fp3
        fsadd.x fp4,fp0
        fsmul.s (60,a0),fp2
        fsadd.x fp3,fp0
        fsadd.x fp2,fp0
        fmove.s fp0,(a1)
        fmovem (sp)+,#60
        rts
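
For readers following along: the "obviously missing indexes" presumably refers to the fact that, with array declarations, outputMatrix[j] and rotationMatrix[k] name whole rows, so the posted loop cannot compile. A hedged reconstruction that matches the assembly above (the declarations are guesses inferred from the generated code, not quoted from the thread):

/* Hedged reconstruction of the fixed loop - declarations are inferred
   from the generated code (4-element point, 4x4 rotation), not quoted
   from the thread. */
float rotationMatrix[4][4];
float inputMatrix[4];     /* homogeneous point (x, y, z, 1) */
float outputMatrix[4];

void multiplyMatrix(void)
{
    for (int j = 0; j < 4; j++) {
        outputMatrix[j] = 0;
        for (int k = 0; k < 4; k++)
            outputMatrix[j] += rotationMatrix[k][j] * inputMatrix[k];
    }
}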




Gunnar von Boehn
(Apollo Team Member)
Posts 6207
01 Jul 2019 19:40


Bebbo, let's see how well we can tune this.

As mentioned, APOLLO is a 3-operand machine.


.L2:
          fdmove.x fp0,fp1
          fdmul.d  (a1)+,fp1

          fdmove.x fp0,fp2
          fdmul.d  (a1)+,fp2

          fdmove.x fp0,fp3
          fdmul.d  (a1)+,fp3

          fdmove.x fp0,fp4
          fdmul.d  (a1)+,fp4

          fdmove.x fp0,fp5
          fdmul.d  (a1)+,fp5

          fmove.d  fp1,(a0)+

          fmove.d  fp2,(a0)+

          fmove.d  fp3,(a0)+

          fmove.d  fp4,(a0)+

          fmove.d  fp5,(a0)+
            subq.l #1,d0
            jne .L2

Code like the above, with 17 instructions and 15 FPU instructions,
could ideally be executed in 10 clocks.

We should see if we can get it this fast.


Stefan "Bebbo" Franke

Posts 139
01 Jul 2019 21:51


I'm getting closer:

.L2:
        fmove.x fp0,fp4
        fmove.x fp0,fp3
        fmul.d (a1)+,fp4
        fmove.x fp0,fp2
        fmul.d (a1)+,fp3
        fmove.x fp0,fp1
        fmul.d (a1)+,fp2
        fmul.d (a1)+,fp1
        fmove.d fp4,(a0)+
        fmove.d fp3,(a0)+
        fmove.d fp2,(a0)+
        fmove.d fp1,(a0)+
        cmp.l a0,a2
        jne .L2



Steve Ferrell

Posts 424
01 Jul 2019 22:34


@Stefan "Bebbo" Franke

The Apollo forum software must be modifying the code when I paste to this thread, or it's being modified when you copy it out.

Try this link:  EXTERNAL LINK


Gunnar von Boehn
(Apollo Team Member)
Posts 6207
02 Jul 2019 05:02


Stefan "Bebbo" Franke wrote:

  I'm getting closer:
 

  .L2:
          fmove.x fp0,fp4
          fmove.x fp0,fp3
          fmul.d (a1)+,fp4
          fmove.x fp0,fp2
          fmul.d (a1)+,fp3
          fmove.x fp0,fp1
          fmul.d (a1)+,fp2
          fmul.d (a1)+,fp1
          fmove.d fp4,(a0)+
          fmove.d fp3,(a0)+
          fmove.d fp2,(a0)+
          fmove.d fp1,(a0)+
          cmp.l a0,a2
          jne .L2
 

 

 
 
Much better!
Very good!
 
Now the loop should do "SUBQ.L #1,D0".
Why does GCC like to use CMPA so much?
 
A "SUBQ.L #1,D0" would be independent of "fmove.d fp1,(a0)+"
and could run in the same clock.
The CMPA is dependent and needs an extra cycle.
 
 
The FMUL should come right after the FMOVE,
as this would create the lowest latency.
Why does GCC do the two MOVEs first?

Could GCC also emit a 3-operand instruction?
FMUL.d (A1),Fp0,Fp1 ?



Stefan "Bebbo" Franke

Posts 139
02 Jul 2019 06:02


Gunnar von Boehn wrote:

Stefan "Bebbo" Franke wrote:

  I'm getting closer:
   

    .L2:
            fmove.x fp0,fp4
            fmove.x fp0,fp3
            fmul.d (a1)+,fp4
            fmove.x fp0,fp2
            fmul.d (a1)+,fp3
            fmove.x fp0,fp1
            fmul.d (a1)+,fp2
            fmul.d (a1)+,fp1
            fmove.d fp4,(a0)+
            fmove.d fp3,(a0)+
            fmove.d fp2,(a0)+
            fmove.d fp1,(a0)+
            cmp.l a0,a2
            jne .L2
   

 

 
 
  Much better!
  Very good!
 
  Now the Loop should do "SUBQ.l #1,D0"
  Why does GCC like to use CMPA so much?

  - First gcc transforms all memory accesses into the form base[offset-expression].
  - Then gcc introduces new loop variables by comparing the costs of various approaches. That stage does not know about insn scheduling.

The disadvantage is that C code which would translate directly into asm gets messed up.
The advantage is that it does not matter how good your C code is - the result is always the same "good" loop.

=> It's a lot about playing with costs, plus tweaking and hacking some cost calculations to get the desired result.

GCC is a configurable monster - all about costs, constraints, and dependencies. If everything matches, the generated code is good.
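
Roughly in C, using the Scale() loop from earlier in the thread (illustration only, not actual compiler output): accesses are first rewritten as base[offset], then the induction-variable pass replaces the counter with whatever variable the cost model rates cheapest - here an end pointer, which is exactly where the CMPA comes from.

/* Illustration only: the shape the IV-selection stage gives the loop
   when an end-pointer compare looks cheapest in the cost model. */
void Scale_as_written(double s, double *b, double *c)
{
    for (unsigned long j = 1000; j; j--) {   /* counted loop */
        *b++ = s * *c++;
        *b++ = s * *c++;
        *b++ = s * *c++;
        *b++ = s * *c++;
    }
}

void Scale_after_ivopts(double s, double *b, double *c)
{
    double *end = b + 4000;                  /* new induction variable */
    do {
        b[0] = s * c[0];                     /* base[offset] form      */
        b[1] = s * c[1];
        b[2] = s * c[2];
        b[3] = s * c[3];
        b += 4;
        c += 4;
    } while (b != end);                      /* CMPA + Bcc on the 68k  */
}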

Gunnar von Boehn wrote:

  The FMUL should come right after the FMOVE,
  as this would create the lowest latency.
  Why does GCC do two MOVE first?


 
  Aren't there enough insns in between?

Gunnar von Boehn wrote:

  Could GCC also emit a 3-operand instruction?
  FMUL.d (A1),Fp0,Fp1 ?

sure thing - if there is an assembler which handles that.

btw: if your loop executes in 10 cycles, then the fmove.x fp0,fp4 instructions are handled by the integer units!?


Gunnar von Boehn
(Apollo Team Member)
Posts 6207
02 Jul 2019 06:17


Stefan "Bebbo" Franke wrote:

 
Gunnar von Boehn wrote:
 
  The FMUL should come right after the FMOVE,
  as this would create the lowest latency.
  Why does GCC do two MOVE first?
 

 
  aren't there enough insn inbetween?

What I wanted to say was:

FMOVE (READ) has no latency.
Its result can be used in the next cycle.
This means the FMUL can come immediately after the FMOVE (read).

The FMUL has a significant latency.
We want to start the FMUL as soon as possible.
Each FMUL should start right after its FMOVE.

The FMOVE (STORE) has to wait for the FMUL to finish.
There should be maximum distance between those instructions.

 
Stefan "Bebbo" Franke wrote:

 
Gunnar von Boehn wrote:

  Could GCC also emit a 3-operand instruction?
  FMUL.d (A1),Fp0,Fp1 ?
 

 
  sure thing - if there is an assembler which handles that.

Yes, I can offer you opcodes for this.

Stefan "Bebbo" Franke wrote:

  btw: if your loop executes in 10 cycles then the fmove.x fp0,fp4 are handled by the integer units!?

APOLLO is internally a 3-operand machine.
The MOVE can be part of a real calculation operation.

We have 2 choices now.
A) Use a new 3-operand opcode.
This would also allow you to access all 32 FPU regs.
32 regs will allow even complicated calculations to be nicely unrolled.

B) Support encoder FUSING of the MOVE and FMUL pair.
(-) This will only allow you to use 8 regs.
(-) The generated code would be bigger (2 instructions instead of 1).
(+) But the generated code is backward compatible and would run on old FPUs too.

What do you think?


Gunnar von Boehn
(Apollo Team Member)
Posts 6207
02 Jul 2019 09:39


Bebbo,

how does GCC count latency?
Is the lowest LATENCY=1 or =0?
If the result can be used the next cycle, do you define this as LATENCY=0 in GCC?



Stefan "Bebbo" Franke

Posts 139
02 Jul 2019 10:52


Gunnar von Boehn wrote:

Bebbo,
 
  how does GCC count latency.
  Is the lowest LATENCY=1 or =0 ?
  If the result can be used the next cycle, do you define this a LATENCY=0 in GCC?
 

You define the used cycle count, e.g. fmul uses 6. The latency is then calculated based on the used resources.

Read here: EXTERNAL LINK
(I followed that example, but I don't yet get what I'd expect.)




Gunnar von Boehn
(Apollo Team Member)
Posts 6207
03 Jul 2019 10:52


Hey Bebbo,

Could you find out why GCC uses CMPA?
CMPA creates a register dependency - so this is not optimal.


Stefan "Bebbo" Franke

Posts 139
03 Jul 2019 10:54


Gunnar von Boehn wrote:

Hey Bebbo,
 
  Could you find out why GCC uses CMPA?
  CMPA creates a register dependency - so this is not optimal.

With the current costs: because it saves an additional sub/add :-)



Gunnar von Boehn
(Apollo Team Member)
Posts 6207
03 Jul 2019 11:01


Stefan "Bebbo" Franke wrote:

 
Gunnar von Boehn wrote:

  Hey Bebbo,
   
    Could you find out why GCC uses CMPA?
    CMPA creates a register dependency - so this is not optimal.
 

  with the current costs: because it saves an additional sub/add :-)
 

 
Sorry, I do not understand the answer.
 
You have either the CMPA or the SUBQ.
The CMPA does not save an instruction.
In fact the CMPA has a register dependency - so it cannot be executed in the 2nd pipe for free, which makes it more costly.
 
Can you please help me understand this better?


Stefan "Bebbo" Franke

Posts 139
03 Jul 2019 15:11


Gunnar von Boehn wrote:

Stefan "Bebbo" Franke wrote:

 
Gunnar von Boehn wrote:

    Hey Bebbo,
   
    Could you find out why GCC uses CMPA?
    CMPA creates a register dependency - so this is not optimal.
   

    with the current costs: because it saves an additional sub/add :-)
 

 
  Sorry, I do not understand the answer.
 
  You have either the CMPA or the SUBQ.
  The CMPA does not save an instruction.
  In fact the CMPA has a register dependency - so it cannot be executed in the 2nd pipe for free, which makes it more costly.
 
  Can you please help me understand this better?

The compiler will also generate a cmp after the sub. A later stage optimizes that cmp away.

My local version adjusts the costs for a cmp after a sub down to zero, plus raises the costs for an addr-cmp after an addr increment; then the sub #1 variant wins.


Gunnar von Boehn
(Apollo Team Member)
Posts 6207
04 Jul 2019 11:37


Hello Bebbo,

here is an example
of how to get really fast FPU code.


START:                          ; first instruction of program
        moveq  #10,D7
        fmove.s #2.0,fp0
LOOP:
        lea    Data.l,a0
        lea    Data2.l,a1
        dc.w    $7340
        fadd.s  (a0)+,fp0      ; (a0)+ + Fp0 => FP1
        dc.w    $7540
        fadd.s  (a0)+,fp0      ; (a0)+ + Fp0 => FP2
        dc.w    $7740
        fadd.s  (a0)+,fp0      ; (a0)+ + Fp0 => FP3
        dc.w    $7940
        fadd.s  (a0)+,fp0      ; (a0)+ + Fp0 => FP4
        dc.w    $7B40
        fadd.s  (a0)+,fp0      ; (a0)+ + Fp0 => FP5
        dc.w    $7D40
        fadd.s  (a0)+,fp0      ; (a0)+ + Fp0 => FP6

        fmove.s fp1,(a1)+
        fmove.s fp2,(a1)+
        fmove.s fp3,(a1)+
        fmove.s fp4,(a1)+
        fmove.s fp5,(a1)+
        fmove.s fp6,(a1)+

        subq.l  #1,D7
        bne    LOOP




Markus B

Posts 209
10 Jul 2019 10:46


Thank you guys for picking up this topic.
An optimized gcc for the 080 will be very beneficial.

Once I can purchase the SA, I'm happy to finally dig into programming for Amiga with the help of gcc for 080. ;-) Never done this before, but really looking forward to it.


Stefan "Bebbo" Franke

Posts 139
10 Jul 2019 14:38


Have a look at cex: EXTERNAL LINK
It's not live yet, since some tests still fail, but it shows the wanted result for the Scale() function.

It also shows that newer gcc versions really do optimize code, while gcc-2.95.3 is closer to an exact translation of the provided code. Thus worse C/C++ code may yield the same assembler code with gcc-6 as the "best" C/C++ code.
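
A small illustration of that point (mine, not from the thread): the two variants below say the same thing, and a newer gcc would be expected to normalise them into the same loop, whereas gcc-2.95.3 translates each one far more literally.

/* Two spellings of the same scale loop.  Newer gcc normalises both in
   its middle end; gcc-2.95.3 stays much closer to the literal source.
   (Illustration, not from the thread.) */
void scale_indexed(double s, double *b, const double *c, unsigned n)
{
    for (unsigned i = 0; i < n; i++)
        b[i] = s * c[i];
}

void scale_pointers(double s, double *b, const double *c, unsigned n)
{
    const double *end = c + n;
    while (c != end)
        *b++ = s * *c++;
}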

Plus note the early scheduling of the div in the assembly for this function:


double foo(double a, double b, double c) {
      return c/2 + c * (b-a) + b * b + a * (a + 1) / (a * a - 1);
}

