/*---------------------------------------------------------------------------
mul44x41.asm:   Multiply 4x4 by 4x1 Matrices 
                (General 3D Graphics Transformation)
-----------------------------------------------------------------------------
Description:

Performs general 4x4 by 4x1 3D graphics transform on N 4x1 points.  Each
point requires 16 multiplies and 12 additions.

Note that if the transformation is a combination of translation, rotation,
and scaling, the routine transf.asm can be used, saving 5 multiplies and 3
additions per point. This routine is intended to be used in other types of
transformations, such as transforming to the canonical clipping volume, which
can't be done with the matrix in transf.asm.

For a description of 3-D transformations and [x,y,z,w] coordinates, see
Chapter 5 of Foley and van Dam's "Computer Graphics, Principles and Practice",
2nd Edition (Addison-Wesley 1990).

-----------------------------------------------------------------------------
Program Characteristics:

Calling Values:
    REGISTER FILE:
    r9 = number of points to transform

    DAG1 (Data Memory):
    i0 = index to [x,y,z] coordinates
    m0 = +1
    m1 = +4  (needed only when assembled with -DWOUT)
    i1 = index to transformed point area
    i2 = index to [w] coordinates  (needed only when assembled with -DWOUT)

    DAG2 (Program Memory):
    i8 = index to 4x4 matrix 
    l8 = 16
    m8 = +1

Registers Used:
    r1, r4-r9, r14-r15

Computation Time = 16N+4 cycles, where N 3-D points are transformed
                 = 640ns per point @ 25MHz
                 = 1.56 million points/sec @ 25MHz

-----------------------------------------------------------------------------
Notes:

This code example can be used in two ways.  If the list of points is a
sequence of [x,y,z,w] coordinates, assemble this code normally ("asm21k
mul44x41").  If the list of points is [x,y,z] coordinates only, and the [w]
coordinate to be used is buried in another list of points, define the
preprocessor variable WOUT with the syntax "asm21k -DWOUT mul44x41".  This
may be needed if transf.asm has been used in previous transformations, since
the transf.asm code doesn't keep the [w] coordinate embedded in the [x,y,z]
coordinate list.

-----------------------------------------------------------------------------
Author:     Jim Donahue, Analog Devices DSP Division
Revised:    12-AUG-91
----------------------------------------------------------------------------*/
.GLOBAL mul44x41;

/* Register aliases used by mul44x41.  The [x,y,z] words are always fetched
   through IP; IS and MS select where the [w] word comes from and how its
   index is stepped afterwards. */
#define IP  i0      /* Principal Index, to [x,y,z] list */

#ifdef WOUT
/* -DWOUT build: [w] is fetched through its own index and modify register */
#define IS  i2      /* Secondary Index, to [w] list */
#define MS  m1      /* Secondary Modify, for stepping from w to w */
#else
/* Default build: [w] is fetched through the same index as [x,y,z] */
#define IS  i0      /* Secondary Index, to [w] list */
#define MS  m0      /* Secondary Modify, for stepping from w to w */
#endif

.SEGMENT /pm    pm_code;
/*
 * mul44x41 -- software-pipelined 4x4 by 4x1 transform of r9 points.
 *
 * Notation used in the comments below:
 *   M0..M15   the sixteen matrix words, in the order they are fetched
 *             through i8 (one fetch per instruction, always into f1)
 *   x,y,z,w   the four words of the current point, held in f4,f5,f6,f7
 *   row r     the output word  M[4r]*x + M[4r+1]*y + M[4r+2]*z + M[4r+3]*w
 *
 * Register roles:
 *   f1        matrix word fetched by the previous instruction
 *   f4-f7     current point (x,y,z via IP, w via IS/MS)
 *   f8        most recent product waiting to be added to an accumulator
 *   f14,f15   alternating row accumulators: rows 0 and 2 are summed in f15,
 *             rows 1 and 3 in f14.  The first product of each row is written
 *             straight into the accumulator instead of going through f8.
 *   r9        point count on entry; decremented to give the loop count
 *
 * Structure: a 5-instruction prologue starts the first point, the loop body
 * finishes one point while starting the next, and the straight-line code
 * after the loop finishes the last point.
 *
 * NOTE(review): the loop count is r9-1, so calling with r9 = 1 loads lcntr
 * with 0 (and r9 = 0 with -1).  Neither case is guarded here -- confirm that
 * callers always pass r9 >= 2, or check how the target handles those counts.
 */
mul44x41:
    /* Prologue: load the first point and form most of its row 0.
       On exit f15 = M0*x + M1*y + M2*z, f8 = M3*w, f1 = M4. */
    r9 = r9-1,             f4=dm(IP,m0),  f1=pm(i8,m8);
    f15=f1*f4,             f5=dm(IP,m0),  f1=pm(i8,m8);
    f8=f1*f5,              f6=dm(IP,m0),  f1=pm(i8,m8);
    f8=f1*f6,  f15=f8+f15, f7=dm(IS,MS),  f1=pm(i8,m8);
    f8=f1*f7,  f15=f8+f15,                f1=pm(i8,m8);

    /* One pass per point except the last.  The body is 16 instructions and
       performs 16 matrix fetches, one full trip around the matrix. */
    lcntr=r9, do trlp until lce;
        /* Last add of row 0, store row 0; products and partial sums of row 1 */
        f14=f1*f4, f15=f8+f15,                f1=pm(i8,m8);
        f8=f1*f5,              dm(i1,m0)=f15, f1=pm(i8,m8);
        f8=f1*f6,  f14=f8+f14,                f1=pm(i8,m8);
        f8=f1*f7,  f14=f8+f14,                f1=pm(i8,m8);
        /* Last add of row 1, store row 1; products and partial sums of row 2 */
        f15=f1*f4, f14=f8+f14,                f1=pm(i8,m8);
        f8=f1*f5,              dm(i1,m0)=f14, f1=pm(i8,m8);
        f8=f1*f6,  f15=f8+f15,                f1=pm(i8,m8);
        f8=f1*f7,  f15=f8+f15,                f1=pm(i8,m8);
        /* Last add of row 2, store row 2; first three products of row 3 */
        f14=f1*f4, f15=f8+f15,                f1=pm(i8,m8);
        f8=f1*f5,              dm(i1,m0)=f15, f1=pm(i8,m8);
        f8=f1*f6,  f14=f8+f14,                f1=pm(i8,m8);

        /* Overlap with the next point: row 3 of the current point is
           finished and stored while the next x,y,z,w are loaded and the
           next row 0 is started.  These five instructions repeat the
           prologue's multiply/add pattern, so the loop re-enters (or falls
           through) in the same state the prologue left. */
        f8=f1*f7,  f14=f8+f14, f4=dm(IP,m0),  f1=pm(i8,m8);
        f15=f1*f4, f14=f8+f14, f5=dm(IP,m0),  f1=pm(i8,m8);
        f8=f1*f5,              f6=dm(IP,m0),  f1=pm(i8,m8);
        f8=f1*f6,  f15=f8+f15, f7=dm(IS,MS),  f1=pm(i8,m8);
trlp:   f8=f1*f7,  f15=f8+f15, dm(i1,m0)=f14, f1=pm(i8,m8);


    /* Epilogue: finish the last point.  Same sequence as the first eleven
       loop instructions, with no further point loads.  Prologue (5) plus
       epilogue (11) make 16 matrix fetches, so with i8 set up as the
       16-word circular buffer described in the header it ends where it
       started. */
    f14=f1*f4, f15=f8+f15,                f1=pm(i8,m8);
    f8=f1*f5,              dm(i1,m0)=f15, f1=pm(i8,m8);
    f8=f1*f6,  f14=f8+f14,                f1=pm(i8,m8);
    f8=f1*f7,  f14=f8+f14,                f1=pm(i8,m8);
    f15=f1*f4, f14=f8+f14,                f1=pm(i8,m8);
    f8=f1*f5,              dm(i1,m0)=f14, f1=pm(i8,m8);
    f8=f1*f6,  f15=f8+f15,                f1=pm(i8,m8);
    f8=f1*f7,  f15=f8+f15,                f1=pm(i8,m8);
    f14=f1*f4, f15=f8+f15,                f1=pm(i8,m8);
    f8=f1*f5,              dm(i1,m0)=f15, f1=pm(i8,m8);
    f8=f1*f6,  f14=f8+f14,                f1=pm(i8,m8);

    /* Delayed return: the two instructions after rts(db) still execute
       before control returns, and are used for the last add and the store
       of row 3. */
    rts(db), f8=f1*f7,  f14=f8+f14;
        f14=f8+f14;
        dm(i1,m0)=f14;


.ENDSEG;